mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/batch.py
DELETED
|
@@ -1,974 +0,0 @@
|
|
|
1
|
-
# Copyright 2023 Iguazio
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
import collections
|
|
16
|
-
import datetime
|
|
17
|
-
import json
|
|
18
|
-
import os
|
|
19
|
-
import re
|
|
20
|
-
from typing import Any, Optional, Union
|
|
21
|
-
|
|
22
|
-
import numpy as np
|
|
23
|
-
import pandas as pd
|
|
24
|
-
import requests
|
|
25
|
-
import v3io
|
|
26
|
-
import v3io.dataplane
|
|
27
|
-
import v3io_frames
|
|
28
|
-
from v3io_frames.frames_pb2 import IGNORE
|
|
29
|
-
|
|
30
|
-
import mlrun.common.helpers
|
|
31
|
-
import mlrun.common.model_monitoring.helpers
|
|
32
|
-
import mlrun.common.schemas.model_monitoring
|
|
33
|
-
import mlrun.data_types.infer
|
|
34
|
-
import mlrun.feature_store as fstore
|
|
35
|
-
import mlrun.utils.v3io_clients
|
|
36
|
-
from mlrun.model_monitoring.metrics.histogram_distance import (
|
|
37
|
-
HellingerDistance,
|
|
38
|
-
HistogramDistanceMetric,
|
|
39
|
-
KullbackLeiblerDivergence,
|
|
40
|
-
TotalVarianceDistance,
|
|
41
|
-
)
|
|
42
|
-
from mlrun.utils import logger
|
|
43
|
-
|
|
44
|
-
# A type for representing a drift result, a tuple of the status and the drift mean:
|
|
45
|
-
DriftResultType = tuple[mlrun.common.schemas.model_monitoring.DriftStatus, float]
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class VirtualDrift:
    """
    Handle the drift calculations between a reference dataset and the latest inputs.

    The object holds the histogram distance metric classes (Total Variance Distance, Hellinger distance and
    Kullback-Leibler divergence) and the methods that turn their results into a drift status.
    """

    def __init__(
        self,
        prediction_col: Optional[str] = None,
        label_col: Optional[str] = None,
        feature_weights: Optional[list[float]] = None,
        inf_capping: Optional[float] = 10,
    ):
        """
        Initialize a Virtual Drift object.

        :param prediction_col:  The name of the dataframe column which represents the predictions of the model. If
                                provided, it will be used for calculating drift over the predictions.
        :param label_col:       The name of the dataframe column which represents the labels of the model. If
                                provided, it will be used for calculating drift over the labels.
        :param feature_weights: Weights that can be applied to the features and to be considered during the drift
                                analysis. They are multiplied (dot product) with the per-feature metric values,
                                which follow the column order of the reference histogram restricted to the features
                                that are common to both datasets.
        :param inf_capping:     A bounded value for the results of the statistical metric. For example, when
                                calculating KL divergence and getting infinite distance between the two
                                distributions, the result will be replaced with the capping value. The value is
                                stored as `self.capping`; it is not referenced by the methods of this class.
        """
        self.prediction_col = prediction_col
        self.label_col = label_col
        self.feature_weights = feature_weights
        self.capping = inf_capping

        # Map each metric name to its class; the classes are instantiated per feature when computing
        self.metrics: dict[str, type[HistogramDistanceMetric]] = {
            metric_class.NAME: metric_class
            for metric_class in (
                TotalVarianceDistance,
                HellingerDistance,
                KullbackLeiblerDivergence,
            )
        }

    @staticmethod
    def dict_to_histogram(histogram_dict: dict[str, dict[str, Any]]) -> pd.DataFrame:
        """
        Convert histogram dictionary to pandas DataFrame with feature histograms as columns.

        :param histogram_dict: Histogram dictionary. Each feature maps to its statistics; features without a
                               "hist" entry are skipped.

        :returns: Histogram dataframe, where every column is the feature's histogram counts divided by the
                  feature's "count" value.
        """
        return pd.DataFrame(
            {
                # Normalize to probability distribution of each feature
                feature: np.array(stats["hist"][0]) / stats["count"]
                for feature, stats in histogram_dict.items()
                if "hist" in stats
            }
        )

    def compute_metrics_over_df(
        self,
        base_histogram: pd.DataFrame,
        latest_histogram: pd.DataFrame,
    ) -> dict[str, dict[str, Any]]:
        """
        Calculate metrics values for each feature.

        For example:
        {tvd: {feature_1: 0.001, feature_2: 0.2, ...}}

        :param base_histogram:   Histogram dataframe that represents the distribution of the features from the
                                 original training set. Its columns define the features that are computed.
        :param latest_histogram: Histogram dataframe that represents the distribution of the features from the
                                 latest input batch. Must include every column of `base_histogram`.

        :returns: A dictionary in which for each metric (key) we assign the values for each feature.
        """
        return {
            metric_name: {
                feature: metric_class(
                    base_histogram.loc[:, feature], latest_histogram.loc[:, feature]
                ).compute()
                for feature in base_histogram
            }
            for metric_name, metric_class in self.metrics.items()
        }

    def _compute_column_drift(
        self,
        base_histogram: pd.DataFrame,
        latest_histogram: pd.DataFrame,
        column: str,
    ) -> dict[str, Any]:
        """
        Compute all the metrics over a single histogram column.

        :param base_histogram:   Histogram dataframe of the reference data.
        :param latest_histogram: Histogram dataframe of the latest data.
        :param column:           The column to compute the metrics for.

        :returns: A dictionary of metric name to the metric value of the given column.

        :raise KeyError: If the column does not exist in one of the histograms.
        """
        # Select with a list so a single-column DataFrame (not a Series) is passed on
        column_measures = self.compute_metrics_over_df(
            base_histogram.loc[:, [column]],
            latest_histogram.loc[:, [column]],
        )
        return {
            metric: values[column] for metric, values in column_measures.items()
        }

    def compute_drift_from_histograms(
        self,
        feature_stats: dict[str, dict[str, Any]],
        current_stats: dict[str, dict[str, Any]],
    ) -> dict[str, dict[str, Any]]:
        """
        Compare the distributions of both the original features data and the latest input data.

        :param feature_stats: Histogram dictionary of the original feature dataset that was used in the model
                              training.
        :param current_stats: Histogram dictionary of the recent input data.

        :returns: A dictionary that includes the drift results for each feature (feature -> metric -> value) along
                  with the "<metric>_sum" and "<metric>_mean" totals of each metric, and "<metric>_weighted_mean"
                  when feature weights were provided.

        :raise ValueError: If the two datasets have no feature in common.
        :raise KeyError:   If a label or prediction column was set but is not one of the common features.
        """
        # Convert histogram dictionaries to DataFrames with feature histograms as columns
        base_histogram = self.dict_to_histogram(feature_stats)
        latest_histogram = self.dict_to_histogram(current_stats)

        # Verify that the datasets share features. The common features keep the column order of the base
        # histogram, so the results (and the alignment with the feature weights) are deterministic
        base_features = set(base_histogram.columns)
        latest_features = set(latest_histogram.columns)
        features_common = [
            feature for feature in base_histogram.columns if feature in latest_features
        ]
        if not features_common:
            raise ValueError(
                f"No common features found: {base_features} <> {latest_features}"
            )

        # Keep only the features that exist in both datasets
        base_histogram = base_histogram.loc[:, features_common]
        latest_histogram = latest_histogram.loc[:, features_common]

        # Compute the statistical metrics per feature
        features_drift_measures = self.compute_metrics_over_df(
            base_histogram, latest_histogram
        )

        # Compute the total values of each metric
        for metric_measures in features_drift_measures.values():
            feature_values = list(metric_measures.values())
            metric_measures["total_sum"] = np.sum(feature_values)
            metric_measures["total_mean"] = np.mean(feature_values)

            # Add weighted mean by given feature weights if provided
            if self.feature_weights:
                metric_measures["total_weighted_mean"] = np.dot(
                    feature_values, self.feature_weights
                )

        # Drift result dictionary with the per-feature values as dictionaries
        drift_result = collections.defaultdict(dict)
        for feature in features_common:
            for metric, values in features_drift_measures.items():
                drift_result[feature][metric] = values[feature]

        # The totals do not depend on the feature, hence they are filled once per metric
        for metric, values in features_drift_measures.items():
            drift_result[f"{metric}_sum"] = values["total_sum"]
            drift_result[f"{metric}_mean"] = values["total_mean"]
            if self.feature_weights:
                drift_result[f"{metric}_weighted_mean"] = values["total_weighted_mean"]

        # Compute the drift metrics over the labels and over the predictions, if their columns were set
        for column in (self.label_col, self.prediction_col):
            if column:
                drift_result[column].update(
                    self._compute_column_drift(base_histogram, latest_histogram, column)
                )

        return drift_result

    @staticmethod
    def check_for_drift_per_feature(
        metrics_results_dictionary: dict[str, Union[float, dict]],
        possible_drift_threshold: float = 0.5,
        drift_detected_threshold: float = 0.7,
    ) -> dict[str, DriftResultType]:
        """
        Check for drift based on the defined decision rule and the calculated results of the statistical metrics per
        feature.

        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means
                                           of all features.
        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
                                           Default: 0.5.
        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
                                           Default: 0.7.

        :returns: A dictionary of all the features and their drift status and results tuples, tuple of:
                  [0] = Drift status enum based on the thresholds given.
                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
                        distance.
        """
        drift_results = {}

        for feature, results in metrics_results_dictionary.items():
            # A feature result must be a dictionary, otherwise it's one of the totals (float):
            if not isinstance(results, dict):
                continue

            tvd = results[TotalVarianceDistance.NAME]
            hellinger = results[HellingerDistance.NAME]
            if tvd is None or hellinger is None:
                logger.warning(
                    "Can't calculate drift for this feature because at least one of the required "
                    "statistical metrics is missing",
                    feature=feature,
                    tvd=tvd,
                    hellinger=hellinger,
                )
                continue

            # The feature's drift value is the mean of the two metrics:
            drift_mean = (tvd + hellinger) / 2
            drift_status = VirtualDrift._get_drift_status(
                drift_result=drift_mean,
                possible_drift_threshold=possible_drift_threshold,
                drift_detected_threshold=drift_detected_threshold,
            )
            drift_results[feature] = (drift_status, drift_mean)

        return drift_results

    @staticmethod
    def check_for_drift(
        metrics_results_dictionary: dict[str, Union[float, dict]],
        possible_drift_threshold: float = 0.5,
        drift_detected_threshold: float = 0.7,
    ) -> DriftResultType:
        """
        Check for drift based on the defined decision rule and the calculated results of the statistical metrics by
        the mean of all features.

        :param metrics_results_dictionary: Dictionary of statistical metrics results per feature and the total means
                                           of all features.
        :param possible_drift_threshold:   Threshold for the calculated result to be in a possible drift status.
                                           Default: 0.5.
        :param drift_detected_threshold:   Threshold for the calculated result to be in a drift detected status.
                                           Default: 0.7.

        :returns: A tuple of:
                  [0] = Drift status enum based on the thresholds given.
                  [1] = The drift result (float) based on the mean of the Total Variance Distance and the Hellinger
                        distance. It is 0.0 when the Hellinger mean is missing.

        :raise KeyError: If the Total Variance Distance mean is not in the dictionary.
        """
        tvd_mean = metrics_results_dictionary[f"{TotalVarianceDistance.NAME}_mean"]
        hellinger_mean = metrics_results_dictionary.get(
            f"{HellingerDistance.NAME}_mean"
        )

        # Compare against None explicitly, a mean of exactly 0.0 is a valid value and not a missing one
        drift_result = 0.0
        if tvd_mean is not None and hellinger_mean is not None:
            drift_result = (tvd_mean + hellinger_mean) / 2

        drift_status = VirtualDrift._get_drift_status(
            drift_result=drift_result,
            possible_drift_threshold=possible_drift_threshold,
            drift_detected_threshold=drift_detected_threshold,
        )

        return drift_status, drift_result

    @staticmethod
    def _get_drift_status(
        drift_result: float,
        possible_drift_threshold: float,
        drift_detected_threshold: float,
    ) -> mlrun.common.schemas.model_monitoring.DriftStatus:
        """
        Get the drift status according to the result and thresholds given.

        :param drift_result:             The drift result.
        :param possible_drift_threshold: Threshold for the calculated result to be in a possible drift status.
        :param drift_detected_threshold: Threshold for the calculated result to be in a drift detected status.

        :returns: The figured drift status. The drift detected threshold is checked first, then the possible drift
                  threshold (both inclusive), otherwise no drift.
        """
        statuses = mlrun.common.schemas.model_monitoring.DriftStatus
        if drift_result >= drift_detected_threshold:
            return statuses.DRIFT_DETECTED
        if drift_result >= possible_drift_threshold:
            return statuses.POSSIBLE_DRIFT
        return statuses.NO_DRIFT
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
def calculate_inputs_statistics(
    sample_set_statistics: dict, inputs: pd.DataFrame
) -> dict:
    """
    Calculate the statistics of the inputs data, to be used for drift monitoring.

    :param sample_set_statistics: The statistics of the sample set (the end point's stored reference dataset). The
                                  histogram bins of each of its features are reused for the histograms of the inputs.
    :param inputs:                The inputs data to calculate the statistics of.

    :returns: The calculated statistics of the inputs data.
    """
    # The initial statistics (including histograms) are inferred by `DFDataInfer`:
    infer = mlrun.data_types.infer
    inputs_statistics = infer.DFDataInfer.get_stats(
        df=inputs,
        options=infer.InferOptions.Histogram,
    )

    for feature, feature_statistics in inputs_statistics.items():
        if feature in sample_set_statistics:
            # Rebuild the histogram over the bin edges that are set in the sample set:
            sample_set_bins = sample_set_statistics[feature]["hist"][1]
            counts, bins = np.histogram(inputs[feature].to_numpy(), bins=sample_set_bins)
            feature_statistics["hist"] = [counts.tolist(), bins.tolist()]
            continue

        if "hist" in feature_statistics:
            # Comply with the other common features' histogram length
            histogram = mlrun.common.model_monitoring.helpers.Histogram(
                feature_statistics["hist"]
            )
            mlrun.common.model_monitoring.helpers.pad_hist(histogram)

    return inputs_statistics
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
class BatchProcessor:
|
|
399
|
-
"""
|
|
400
|
-
The main object to handle the batch processing job. This object is used to get the required configurations and
|
|
401
|
-
to manage the main monitoring drift detection process based on the current batch.
|
|
402
|
-
Note that the BatchProcessor object requires access keys along with valid project configurations.
|
|
403
|
-
"""
|
|
404
|
-
|
|
405
|
-
def __init__(
    self,
    context: mlrun.run.MLClientCtx,
    project: str,
):
    """
    Set up the batch processor of the given project.

    :param context: An MLRun context. Its parameters must include the batch intervals dictionary and may include
                    a list of model endpoints.
    :param project: Project name.
    """
    self.context = context
    self.project = project

    # The drift calculations are done by a virtual drift object
    self.virtual_drift = VirtualDrift(inf_capping=10)

    logger.info(
        "Initializing BatchProcessor",
        project=project,
    )

    # Default drift thresholds, taken from the model monitoring configuration
    default_thresholds = (
        mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default
    )
    self.default_possible_drift_threshold = default_thresholds.possible_drift
    self.default_drift_detected_threshold = default_thresholds.drift_detected

    # The model endpoint store of the project
    self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)

    if not mlrun.mlconf.is_ce_mode():
        # TODO: Once there is a time series DB alternative in a non-CE deployment, we need to update this if
        #  statement to be applied only for V3IO TSDB
        self._initialize_v3io_configurations()

    # Holds an error to be raised later on, if one occurs
    self.exception = None

    event_fields = mlrun.common.schemas.model_monitoring.EventFieldType

    # The batch interval range
    self.batch_dict = context.parameters[event_fields.BATCH_INTERVALS_DICT]

    # TODO: This will be removed in 1.5.0 once the job params can be parsed with different types
    # A batch dictionary that was passed as a string is converted into a dictionary
    if isinstance(self.batch_dict, str):
        self._parse_batch_dict_str()

    # If provided, only the model endpoints in that list will be analyzed
    self.model_endpoints = context.parameters.get(
        event_fields.MODEL_ENDPOINTS, None
    )
|
|
463
|
-
|
|
464
|
-
def _initialize_v3io_configurations(self):
    """Set up the V3IO access keys, storage locations and clients of the batch job.

    Reads the access keys from the environment, resolves the container and path of the
    project's TSDB table and log stream, creates the V3IO and frames clients and asks
    frames to create the TSDB table (the log message states that an existing table is
    left as it is).
    """
    # The dedicated model monitoring key wins, the general V3IO key is the fallback
    self.v3io_access_key = os.environ.get("V3IO_ACCESS_KEY")
    self.model_monitoring_access_key = (
        os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
    )

    file_target_kinds = mlrun.common.schemas.model_monitoring.FileTargetKind
    split_store_prefix = (
        mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix
    )

    # Location of the monitoring events table in the TSDB
    events_target = mlrun.mlconf.get_model_monitoring_file_target_path(
        project=self.project,
        kind=file_target_kinds.EVENTS,
    )
    _, self.tsdb_container, self.tsdb_path = split_store_prefix(events_target)

    # Location of the log stream
    log_stream_target = mlrun.mlconf.get_model_monitoring_file_target_path(
        project=self.project,
        kind=file_target_kinds.LOG_STREAM,
    )
    _, self.stream_container, self.stream_path = split_store_prefix(
        log_stream_target
    )

    # Get the clients based on the v3io configuration, the frames client
    # will be used later for writing the results into the tsdb
    client_factory = mlrun.utils.v3io_clients
    self.v3io = client_factory.get_v3io_client(access_key=self.v3io_access_key)
    self.frames = client_factory.get_frames_client(
        address=mlrun.mlconf.v3io_framesd,
        container=self.tsdb_container,
        token=self.v3io_access_key,
    )

    logger.info(
        "Creating table in TSDB if it does not already exist", table=self.tsdb_path
    )
    self.frames.create(
        backend="tsdb", table=self.tsdb_path, if_exists=IGNORE, rate="1/s"
    )
|
|
514
|
-
|
|
515
|
-
def post_init(self):
    """
    Prepare the resources required before running the batch processing.

    Outside of CE mode this creates the single-shard V3IO stream of the project.
    A 400 response that mentions "ResourceInUse" (presumably: the stream already
    exists - TODO confirm) is accepted silently, any other response goes through
    `raise_for_status`.
    """
    if mlrun.mlconf.is_ce_mode():
        # Nothing to prepare in CE mode
        return

    # Create v3io stream based on the input stream, the status is checked below
    creation_response = self.v3io.stream.create(
        container=self.stream_container,
        stream_path=self.stream_path,
        shard_count=1,
        raise_for_status=v3io.dataplane.RaiseForStatus.never,
        access_key=self.v3io_access_key,
    )

    resource_in_use = creation_response.status_code == 400 and (
        "ResourceInUse" in str(creation_response.body)
    )
    if resource_in_use:
        return

    creation_response.raise_for_status([409, 204, 403])
|
|
535
|
-
|
|
536
|
-
def run(self):
    """
    Run the drift analysis for the model endpoints of the project.

    Lists the model endpoints (limited to `self.model_endpoints` when provided) and
    calls `update_drift_metrics` for every active endpoint whose monitoring mode is
    enabled, except for router endpoints. If listing the endpoints fails, the error
    is logged and the method returns without processing anything.
    """
    try:
        listed_endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
    except Exception as e:
        logger.error("Failed to list endpoints", exc=e)
        return

    monitoring_schemas = mlrun.common.schemas.model_monitoring
    fields = monitoring_schemas.EventFieldType

    for endpoint in listed_endpoints:
        # Only active endpoints with enabled monitoring are analyzed
        if not endpoint[fields.ACTIVE]:
            continue
        if (
            endpoint[fields.MONITORING_MODE]
            != monitoring_schemas.ModelMonitoringMode.enabled.value
        ):
            continue

        # Router endpoint has no feature stats
        if (
            int(endpoint[fields.ENDPOINT_TYPE])
            == monitoring_schemas.EndpointType.ROUTER
        ):
            logger.info(f"{endpoint[fields.UID]} is router skipping")
            continue

        self.update_drift_metrics(endpoint=endpoint)
|
|
572
|
-
|
|
573
|
-
def update_drift_metrics(self, endpoint: dict):
    """
    Calculate the drift of a single model endpoint and store the results.

    The method reads the events of the current batch interval from the monitoring
    feature set of the endpoint, calculates the current statistics, compares them to
    the feature stats stored on the endpoint and updates the model endpoint record.
    Outside of CE mode the results are also written with `_update_drift_in_v3io_tsdb`,
    otherwise they are pushed with `_update_drift_in_prometheus`.

    When no events are found for the interval (empty dataframe or missing parquet
    file) a warning is logged and the method returns without updating anything. Any
    other exception is logged and kept in `self.exception` instead of being raised.

    :param endpoint: Model endpoint record, as listed from the model endpoint store.
    """
    event_fields = mlrun.common.schemas.model_monitoring.EventFieldType
    endpoint_id = endpoint[event_fields.UID]

    try:
        m_fs = fstore.get_feature_set(endpoint[event_fields.FEATURE_SET_URI])

        # Getting batch interval start time and end time
        start_time, end_time = self._get_interval_range()

        try:
            df = m_fs.to_dataframe(
                start_time=start_time,
                end_time=end_time,
                time_column=event_fields.TIMESTAMP,
            )

            if len(df) == 0:
                logger.warn(
                    "Not enough model events since the beginning of the batch interval",
                    parquet_target=m_fs.status.targets[0].path,
                    endpoint=endpoint_id,
                    min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                    # Report the interval that was actually queried
                    start_time=str(start_time),
                    end_time=str(end_time),
                )
                return

        # TODO: The below warn will be removed once the state of the Feature Store target is updated
        #       as expected. In that case, the existence of the file will be checked before trying to get
        #       the offline data from the feature set.
        # Continue if not enough events provided since the deployment of the model endpoint
        except FileNotFoundError:
            logger.warn(
                "Parquet not found, probably due to not enough model events",
                parquet_target=m_fs.status.targets[0].path,
                endpoint=endpoint_id,
                min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
            )
            return

        # Get feature names from monitoring feature set
        feature_names = [
            feature_name["name"] for feature_name in m_fs.spec.features.to_dict()
        ]

        # Create DataFrame based on the input features
        stats_columns = [event_fields.TIMESTAMP, *feature_names]

        # Add label names if provided (they may be stored as a JSON string)
        labels = endpoint[event_fields.LABEL_NAMES]
        if labels:
            if isinstance(labels, str):
                labels = json.loads(labels)
            for label in labels:
                if label not in stats_columns:
                    stats_columns.append(label)
        named_features_df = df[stats_columns].copy()

        # Infer feature set stats and schema
        fstore.api._infer_from_static_df(
            named_features_df,
            m_fs,
            options=mlrun.data_types.infer.InferOptions.all_stats(),
        )

        # Save feature set to apply changes
        m_fs.save()

        # Get the timestamp of the latest request (the last row of the dataframe):
        timestamp = df[event_fields.TIMESTAMP].iloc[-1]

        # Get the feature stats from the model endpoint for reference data
        feature_stats = json.loads(endpoint[event_fields.FEATURE_STATS])
        # Pad the original feature stats to accommodate current data out
        # of the original range (unless already padded)
        mlrun.common.model_monitoring.helpers.pad_features_hist(
            mlrun.common.model_monitoring.helpers.FeatureStats(feature_stats)
        )

        # Get the current stats:
        current_stats = calculate_inputs_statistics(
            sample_set_statistics=feature_stats,
            inputs=named_features_df,
        )

        # Compute the drift based on the histogram of the current stats and the histogram of the original
        # feature stats that can be found in the model endpoint object:
        drift_result = self.virtual_drift.compute_drift_from_histograms(
            feature_stats=feature_stats,
            current_stats=current_stats,
        )
        logger.info("Drift result", drift_result=drift_result)

        # Get drift thresholds from the model configuration:
        monitor_configuration = (
            json.loads(endpoint[event_fields.MONITOR_CONFIGURATION]) or {}
        )

        # For backwards compatibility first check if the old drift thresholds
        # (both `possible drift and `drift_detected`) keys exist in the monitor configuration dict
        # TODO: Remove the first get in 1.7.0
        possible_drift = monitor_configuration.get(
            "possible_drift",
            monitor_configuration.get(
                event_fields.POSSIBLE_DRIFT_THRESHOLD,
                self.default_possible_drift_threshold,
            ),
        )

        drift_detected = monitor_configuration.get(
            "drift_detected",
            monitor_configuration.get(
                event_fields.DRIFT_DETECTED_THRESHOLD,
                self.default_drift_detected_threshold,
            ),
        )

        # Check for possible drift based on the results of the statistical metrics defined above:
        drift_status, drift_measure = self.virtual_drift.check_for_drift(
            metrics_results_dictionary=drift_result,
            possible_drift_threshold=possible_drift,
            drift_detected_threshold=drift_detected,
        )
        logger.info(
            "Drift status",
            endpoint_id=endpoint_id,
            drift_status=drift_status.value,
            drift_measure=drift_measure,
        )

        attributes = {
            "current_stats": json.dumps(current_stats),
            "drift_measures": json.dumps(drift_result),
            "drift_status": drift_status.value,
        }

        self.db.update_model_endpoint(
            endpoint_id=endpoint_id,
            attributes=attributes,
        )

        if not mlrun.mlconf.is_ce_mode():
            # Generate V3IO KV schema if not exist
            self._infer_kv_schema()

            # Update drift results in TSDB
            self._update_drift_in_v3io_tsdb(
                endpoint_id=endpoint_id,
                drift_status=drift_status,
                drift_measure=drift_measure,
                drift_result=drift_result,
                timestamp=timestamp,
            )

        else:
            # Update drift results in Prometheus
            self._update_drift_in_prometheus(
                endpoint_id=endpoint_id,
                drift_status=drift_status,
                drift_result=drift_result,
            )

    except Exception as e:
        # Log the error itself as well, the same way `run` reports its failures
        logger.error(f"Exception for endpoint {endpoint_id}", exc=e)
        # Keep the error instead of raising it, so the remaining endpoints are still processed
        self.exception = e

    # Logged after a completed or a failed update (the early returns above skip it)
    logger.info(
        "Done updating drift measures",
        endpoint_id=endpoint_id,
    )
|
|
780
|
-
|
|
781
|
-
def _get_interval_range(self) -> tuple[datetime.datetime, datetime.datetime]:
    """Getting batch interval time range

    The range ends at the current time and starts one batch interval earlier, where
    the interval is built from the minutes, hours and days found in `self.batch_dict`.

    :return: A tuple of (start_time, end_time), both taken from `datetime.datetime.now()`
             without a timezone.
    """
    event_fields = mlrun.common.schemas.model_monitoring.EventFieldType
    interval = datetime.timedelta(
        minutes=self.batch_dict[event_fields.MINUTES],
        hours=self.batch_dict[event_fields.HOURS],
        days=self.batch_dict[event_fields.DAYS],
    )
    # Read the clock once, so that end_time - start_time is exactly the batch interval
    end_time = datetime.datetime.now()
    start_time = end_time - interval
    return start_time, end_time
|
|
795
|
-
|
|
796
|
-
def _parse_batch_dict_str(self):
    """Convert batch dictionary string into a valid dictionary

    `self.batch_dict` is expected to hold a flat `key: number` mapping written as a
    string, for example "{minutes: 0, hours: 1, days: 0}". Braces, whitespace and
    quote characters are ignored, so JSON text and the `str()` of a Python dictionary
    are parsed as well. The attribute is replaced with the parsed dictionary, in which
    every value is a float.

    :raise IndexError: If one of the comma separated items has no `:` separator.
    :raise ValueError: If a value cannot be converted to a float.
    """
    # Remove unnecessary characters (braces, whitespace, quotes) from the provided string
    cleaned = re.sub(r"[{}\s'\"]", "", self.batch_dict)

    # Build the dictionary of batch interval ranges
    parsed = {}
    for pair in cleaned.split(","):
        pair_list = pair.split(":")
        parsed[pair_list[0]] = float(pair_list[1])
    self.batch_dict = parsed
|
|
807
|
-
|
|
808
|
-
def _update_drift_in_v3io_tsdb(
    self,
    endpoint_id: str,
    drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
    drift_measure: float,
    drift_result: dict[str, dict[str, Any]],
    timestamp: pd.Timestamp,
):
    """Write the drift results to the V3IO stream and to the TSDB table.

    A record is put in the stream only when the status is a possible or a detected
    drift. The mean drift measures are always written to the TSDB table; a frames
    error during that write is logged as a warning and not raised.

    :param endpoint_id:   The unique id of the model endpoint.
    :param drift_status:  Drift status result. Possible values can be found under DriftStatus enum class.
    :param drift_measure: The drift result (float) based on the mean of the Total Variance Distance and the
                          Hellinger distance.
    :param drift_result:  A dictionary that includes the drift results for each feature.
    :param timestamp:     Pandas Timestamp value.

    """
    statuses = mlrun.common.schemas.model_monitoring.DriftStatus
    is_drifting = (
        drift_status == statuses.POSSIBLE_DRIFT
        or drift_status == statuses.DRIFT_DETECTED
    )

    if is_drifting:
        stream_event = {
            "endpoint_id": endpoint_id,
            "drift_status": drift_status.value,
            "drift_measure": drift_measure,
            "drift_per_feature": {**drift_result},
        }
        self.v3io.stream.put_records(
            container=self.stream_container,
            stream_path=self.stream_path,
            records=[{"data": json.dumps(stream_event)}],
        )

    # Update the results in tsdb:
    tsdb_record = {
        "endpoint_id": endpoint_id,
        "timestamp": timestamp,
        "record_type": "drift_measures",
    }
    for mean_metric in ("tvd_mean", "kld_mean", "hellinger_mean"):
        tsdb_record[mean_metric] = drift_result[mean_metric]

    try:
        self.frames.write(
            backend="tsdb",
            table=self.tsdb_path,
            dfs=pd.DataFrame.from_records([tsdb_record]),
            index_cols=["timestamp", "endpoint_id", "record_type"],
        )
    except v3io_frames.errors.Error as err:
        logger.warn(
            "Could not write drift measures to TSDB",
            err=err,
            tsdb_path=self.tsdb_path,
            endpoint=endpoint_id,
        )
|
|
874
|
-
|
|
875
|
-
def _update_drift_in_prometheus(
    self,
    endpoint_id: str,
    drift_status: mlrun.common.schemas.model_monitoring.DriftStatus,
    drift_result: dict[str, dict[str, Any]],
):
    """Push drift metrics to Prometheus registry. Please note that the metrics are being pushed through HTTP
    to the monitoring stream pod that writes them into a local registry. Afterwards, Prometheus will scrape
    these metrics that will be available in the Grafana charts.

    A connection error is logged as a warning and not raised.

    :param endpoint_id:  The unique id of the model endpoint.
    :param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
    :param drift_result: A dictionary that includes the drift results for each feature.

    """
    fields = mlrun.common.schemas.model_monitoring.EventFieldType
    sink_url = mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
        project=self.project, namespace=mlrun.mlconf.namespace
    )
    session = mlrun.utils.HTTPSessionWithRetry(
        retry_on_post=True,
        verbose=True,
        max_retries=1,
    )

    try:
        # Model monitoring stream http health check
        session.request("GET", url=sink_url)

        # Update statistical metrics
        metrics_payload = [
            {
                fields.ENDPOINT_ID: endpoint_id,
                fields.METRIC: metric_name,
                fields.VALUE: drift_result[metric_name],
            }
            for metric_name in ("hellinger_mean", "tvd_mean", "kld_mean")
        ]
        session.request(
            method="POST",
            url=sink_url + "/monitoring-batch-metrics",
            data=json.dumps(metrics_payload),
        )

        # Update drift status
        status_payload = {
            fields.ENDPOINT_ID: endpoint_id,
            fields.DRIFT_STATUS: drift_status.value,
        }
        session.request(
            method="POST",
            url=sink_url + "/monitoring-drift-status",
            data=json.dumps(status_payload),
        )

    except requests.exceptions.ConnectionError as exc:
        logger.warning(
            "Can't push metrics to Prometheus registry. "
            "Monitoring stream pod is not found, probably not deployed. "
            "To deploy, call set_tracking() on a serving function. exc: ",
            exc=exc,
        )
|
|
945
|
-
|
|
946
|
-
def _infer_kv_schema(self):
    """
    Make sure the KV table has a schema file, which is used by the Grafana dashboards.

    The table is queried for an item named ".#schema"; only when nothing is found the
    frames client is asked to infer the schema.
    """
    store = self.db
    schema_cursor = store.client.kv.new_cursor(
        container=store.container,
        table_path=store.path,
        filter_expression='__name==".#schema"',
    )

    if schema_cursor.all():
        # The schema file is already there
        return

    logger.info("Generate a new V3IO KV schema file", kv_table_path=store.path)
    self.frames.execute(backend="kv", table=store.path, command="infer_schema")
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
def handler(context: mlrun.run.MLClientCtx):
    """Run the batch processing for the project of the given context.

    Creates a `BatchProcessor`, runs its preparation step and the drift analysis, and
    raises the exception kept on the processor (if any) once the run is over.

    :param context: The MLRun context of the job, it provides the project and the parameters.
    """
    processor = BatchProcessor(context=context, project=context.project)
    processor.post_init()
    processor.run()

    failure = processor.exception
    if failure:
        raise failure
|