mlrun 1.7.0rc3__py3-none-any.whl → 1.7.0rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/artifacts/manager.py +6 -1
- mlrun/common/constants.py +2 -0
- mlrun/common/model_monitoring/helpers.py +12 -6
- mlrun/common/schemas/__init__.py +11 -0
- mlrun/common/schemas/api_gateway.py +85 -0
- mlrun/common/schemas/auth.py +2 -2
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/common.py +40 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -1
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +31 -17
- mlrun/datastore/azure_blob.py +22 -9
- mlrun/datastore/base.py +15 -25
- mlrun/datastore/datastore.py +19 -8
- mlrun/datastore/datastore_profile.py +47 -5
- mlrun/datastore/google_cloud_storage.py +10 -6
- mlrun/datastore/hdfs.py +51 -0
- mlrun/datastore/redis.py +4 -0
- mlrun/datastore/s3.py +4 -0
- mlrun/datastore/sources.py +29 -43
- mlrun/datastore/targets.py +59 -53
- mlrun/datastore/utils.py +2 -49
- mlrun/datastore/v3io.py +4 -0
- mlrun/db/base.py +50 -0
- mlrun/db/httpdb.py +121 -50
- mlrun/db/nopdb.py +13 -0
- mlrun/execution.py +3 -3
- mlrun/feature_store/feature_vector.py +2 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
- mlrun/frameworks/tf_keras/model_handler.py +7 -7
- mlrun/k8s_utils.py +10 -5
- mlrun/kfpops.py +19 -10
- mlrun/model.py +5 -0
- mlrun/model_monitoring/api.py +3 -3
- mlrun/model_monitoring/application.py +1 -1
- mlrun/model_monitoring/applications/__init__.py +13 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
- mlrun/model_monitoring/batch.py +9 -111
- mlrun/model_monitoring/controller.py +73 -55
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/features_drift_table.py +62 -53
- mlrun/model_monitoring/helpers.py +30 -21
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
- mlrun/package/packagers/pandas_packagers.py +3 -3
- mlrun/package/utils/_archiver.py +3 -1
- mlrun/platforms/iguazio.py +8 -65
- mlrun/projects/pipelines.py +21 -11
- mlrun/projects/project.py +180 -42
- mlrun/run.py +1 -1
- mlrun/runtimes/base.py +25 -2
- mlrun/runtimes/kubejob.py +5 -3
- mlrun/runtimes/local.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +6 -6
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +300 -0
- mlrun/runtimes/nuclio/function.py +9 -9
- mlrun/runtimes/nuclio/serving.py +3 -3
- mlrun/runtimes/pod.py +3 -3
- mlrun/runtimes/sparkjob/spark3job.py +3 -3
- mlrun/serving/remote.py +4 -2
- mlrun/serving/server.py +2 -8
- mlrun/utils/async_http.py +3 -3
- mlrun/utils/helpers.py +27 -5
- mlrun/utils/http.py +3 -3
- mlrun/utils/logger.py +2 -2
- mlrun/utils/notifications/notification_pusher.py +6 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/METADATA +13 -16
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/RECORD +76 -68
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# Copyright 2024 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import Final, Optional, Protocol
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
from pandas import DataFrame, Timestamp
|
|
20
|
+
|
|
21
|
+
from mlrun.common.schemas.model_monitoring.constants import (
|
|
22
|
+
MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME,
|
|
23
|
+
ResultKindApp,
|
|
24
|
+
ResultStatusApp,
|
|
25
|
+
)
|
|
26
|
+
from mlrun.model_monitoring.application import (
|
|
27
|
+
ModelMonitoringApplicationBase,
|
|
28
|
+
ModelMonitoringApplicationResult,
|
|
29
|
+
)
|
|
30
|
+
from mlrun.model_monitoring.batch import (
|
|
31
|
+
HellingerDistance,
|
|
32
|
+
HistogramDistanceMetric,
|
|
33
|
+
KullbackLeiblerDivergence,
|
|
34
|
+
TotalVarianceDistance,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class InvalidMetricValueError(ValueError):
|
|
39
|
+
pass
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class InvalidThresholdValueError(ValueError):
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ValueClassifier(Protocol):
|
|
47
|
+
def value_to_status(self, value: float) -> ResultStatusApp: ...
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
|
|
51
|
+
class DataDriftClassifier:
|
|
52
|
+
"""
|
|
53
|
+
Classify data drift numeric values into categorical status.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
potential: float = 0.5
|
|
57
|
+
detected: float = 0.7
|
|
58
|
+
|
|
59
|
+
def __post_init__(self) -> None:
|
|
60
|
+
"""Catch erroneous threshold values"""
|
|
61
|
+
if not 0 < self.potential < self.detected < 1:
|
|
62
|
+
raise InvalidThresholdValueError(
|
|
63
|
+
"The provided thresholds do not comply with the rules"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
def value_to_status(self, value: float) -> ResultStatusApp:
|
|
67
|
+
"""
|
|
68
|
+
Translate the numeric value into status category.
|
|
69
|
+
|
|
70
|
+
:param value: The numeric value of the data drift metric, between 0 and 1.
|
|
71
|
+
:returns: `ResultStatusApp` according to the classification.
|
|
72
|
+
"""
|
|
73
|
+
if value > 1 or value < 0:
|
|
74
|
+
raise InvalidMetricValueError(
|
|
75
|
+
f"{value = } is invalid, must be in the range [0, 1]."
|
|
76
|
+
)
|
|
77
|
+
if value >= self.detected:
|
|
78
|
+
return ResultStatusApp.detected
|
|
79
|
+
if value >= self.potential:
|
|
80
|
+
return ResultStatusApp.potential_detection
|
|
81
|
+
return ResultStatusApp.no_detection
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
85
|
+
"""
|
|
86
|
+
MLRun's default data drift application for model monitoring.
|
|
87
|
+
|
|
88
|
+
The application calculates the metrics over the features' histograms.
|
|
89
|
+
Each metric is calculated over all the features, the mean is taken,
|
|
90
|
+
and the status is returned.
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
NAME: Final[str] = MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME
|
|
94
|
+
METRIC_KIND: Final[ResultKindApp] = ResultKindApp.data_drift
|
|
95
|
+
|
|
96
|
+
_REQUIRED_METRICS = {HellingerDistance, TotalVarianceDistance}
|
|
97
|
+
|
|
98
|
+
metrics: list[type[HistogramDistanceMetric]] = [
|
|
99
|
+
HellingerDistance,
|
|
100
|
+
KullbackLeiblerDivergence,
|
|
101
|
+
TotalVarianceDistance,
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
def __init__(self, value_classifier: Optional[ValueClassifier] = None) -> None:
|
|
105
|
+
"""
|
|
106
|
+
Initialize the data drift application.
|
|
107
|
+
|
|
108
|
+
:param value_classifier: Classifier object that adheres to the `ValueClassifier` protocol.
|
|
109
|
+
If not provided, the default `DataDriftClassifier()` is used.
|
|
110
|
+
"""
|
|
111
|
+
self._value_classifier = value_classifier or DataDriftClassifier()
|
|
112
|
+
assert self._REQUIRED_METRICS <= set(
|
|
113
|
+
self.metrics
|
|
114
|
+
), "TVD and Hellinger distance are required for the general data drift result"
|
|
115
|
+
|
|
116
|
+
def _compute_metrics_per_feature(
|
|
117
|
+
self, sample_df_stats: DataFrame, feature_stats: DataFrame
|
|
118
|
+
) -> dict[type[HistogramDistanceMetric], list[float]]:
|
|
119
|
+
"""Compute the metrics for the different features and labels"""
|
|
120
|
+
metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]] = {
|
|
121
|
+
metric_class: [] for metric_class in self.metrics
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
for (sample_feat, sample_hist), (reference_feat, reference_hist) in zip(
|
|
125
|
+
sample_df_stats.items(), feature_stats.items()
|
|
126
|
+
):
|
|
127
|
+
assert sample_feat == reference_feat, "The features do not match"
|
|
128
|
+
self.context.logger.info(
|
|
129
|
+
"Computing metrics for feature", feature_name=sample_feat
|
|
130
|
+
)
|
|
131
|
+
sample_arr = np.asarray(sample_hist)
|
|
132
|
+
reference_arr = np.asarray(reference_hist)
|
|
133
|
+
for metric in self.metrics:
|
|
134
|
+
metric_name = metric.NAME
|
|
135
|
+
self.context.logger.debug(
|
|
136
|
+
"Computing data drift metric",
|
|
137
|
+
metric_name=metric_name,
|
|
138
|
+
feature_name=sample_feat,
|
|
139
|
+
)
|
|
140
|
+
metrics_per_feature[metric].append(
|
|
141
|
+
metric(distrib_t=sample_arr, distrib_u=reference_arr).compute()
|
|
142
|
+
)
|
|
143
|
+
self.context.logger.info("Finished computing the metrics")
|
|
144
|
+
|
|
145
|
+
return metrics_per_feature
|
|
146
|
+
|
|
147
|
+
def _add_general_drift_result(
|
|
148
|
+
self, results: list[ModelMonitoringApplicationResult], value: float
|
|
149
|
+
) -> None:
|
|
150
|
+
results.append(
|
|
151
|
+
ModelMonitoringApplicationResult(
|
|
152
|
+
name="general_drift",
|
|
153
|
+
value=value,
|
|
154
|
+
kind=self.METRIC_KIND,
|
|
155
|
+
status=self._value_classifier.value_to_status(value),
|
|
156
|
+
)
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
def _get_results(
|
|
160
|
+
self, metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]]
|
|
161
|
+
) -> list[ModelMonitoringApplicationResult]:
|
|
162
|
+
"""Average the metrics over the features and add the status"""
|
|
163
|
+
results: list[ModelMonitoringApplicationResult] = []
|
|
164
|
+
hellinger_tvd_values: list[float] = []
|
|
165
|
+
for metric_class, metric_values in metrics_per_feature.items():
|
|
166
|
+
self.context.logger.debug(
|
|
167
|
+
"Averaging metric over the features", metric_name=metric_class.NAME
|
|
168
|
+
)
|
|
169
|
+
value = np.mean(metric_values)
|
|
170
|
+
if metric_class == KullbackLeiblerDivergence:
|
|
171
|
+
# This metric is not bounded from above [0, inf).
|
|
172
|
+
# No status is currently reported for KL divergence
|
|
173
|
+
status = ResultStatusApp.irrelevant
|
|
174
|
+
else:
|
|
175
|
+
status = self._value_classifier.value_to_status(value)
|
|
176
|
+
if metric_class in self._REQUIRED_METRICS:
|
|
177
|
+
hellinger_tvd_values.append(value)
|
|
178
|
+
results.append(
|
|
179
|
+
ModelMonitoringApplicationResult(
|
|
180
|
+
name=f"{metric_class.NAME}_mean",
|
|
181
|
+
value=value,
|
|
182
|
+
kind=self.METRIC_KIND,
|
|
183
|
+
status=status,
|
|
184
|
+
)
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
self._add_general_drift_result(
|
|
188
|
+
results=results, value=np.mean(hellinger_tvd_values)
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
return results
|
|
192
|
+
|
|
193
|
+
def do_tracking(
|
|
194
|
+
self,
|
|
195
|
+
application_name: str,
|
|
196
|
+
sample_df_stats: DataFrame,
|
|
197
|
+
feature_stats: DataFrame,
|
|
198
|
+
sample_df: DataFrame,
|
|
199
|
+
start_infer_time: Timestamp,
|
|
200
|
+
end_infer_time: Timestamp,
|
|
201
|
+
latest_request: Timestamp,
|
|
202
|
+
endpoint_id: str,
|
|
203
|
+
output_stream_uri: str,
|
|
204
|
+
) -> list[ModelMonitoringApplicationResult]:
|
|
205
|
+
"""
|
|
206
|
+
Calculate and return the data drift metrics, averaged over the features.
|
|
207
|
+
|
|
208
|
+
Refer to `ModelMonitoringApplicationBase` for the meaning of the
|
|
209
|
+
function arguments.
|
|
210
|
+
"""
|
|
211
|
+
self.context.logger.debug("Starting to run the application")
|
|
212
|
+
metrics_per_feature = self._compute_metrics_per_feature(
|
|
213
|
+
sample_df_stats=sample_df_stats, feature_stats=feature_stats
|
|
214
|
+
)
|
|
215
|
+
self.context.logger.debug("Computing average per metric")
|
|
216
|
+
results = self._get_results(metrics_per_feature)
|
|
217
|
+
self.context.logger.debug("Finished running the application", results=results)
|
|
218
|
+
return results
|
mlrun/model_monitoring/batch.py
CHANGED
|
@@ -11,15 +11,13 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
import abc
|
|
14
|
+
|
|
16
15
|
import collections
|
|
17
|
-
import dataclasses
|
|
18
16
|
import datetime
|
|
19
17
|
import json
|
|
20
18
|
import os
|
|
21
19
|
import re
|
|
22
|
-
from typing import Any,
|
|
20
|
+
from typing import Any, Optional, Union
|
|
23
21
|
|
|
24
22
|
import numpy as np
|
|
25
23
|
import pandas as pd
|
|
@@ -35,118 +33,18 @@ import mlrun.common.schemas.model_monitoring
|
|
|
35
33
|
import mlrun.data_types.infer
|
|
36
34
|
import mlrun.feature_store as fstore
|
|
37
35
|
import mlrun.utils.v3io_clients
|
|
36
|
+
from mlrun.model_monitoring.metrics.histogram_distance import (
|
|
37
|
+
HellingerDistance,
|
|
38
|
+
HistogramDistanceMetric,
|
|
39
|
+
KullbackLeiblerDivergence,
|
|
40
|
+
TotalVarianceDistance,
|
|
41
|
+
)
|
|
38
42
|
from mlrun.utils import logger
|
|
39
43
|
|
|
40
44
|
# A type for representing a drift result, a tuple of the status and the drift mean:
|
|
41
45
|
DriftResultType = tuple[mlrun.common.schemas.model_monitoring.DriftStatus, float]
|
|
42
46
|
|
|
43
47
|
|
|
44
|
-
@dataclasses.dataclass
|
|
45
|
-
class HistogramDistanceMetric(abc.ABC):
|
|
46
|
-
"""
|
|
47
|
-
An abstract base class for distance metrics between histograms.
|
|
48
|
-
|
|
49
|
-
:args distrib_t: array of distribution t (usually the latest dataset distribution)
|
|
50
|
-
:args distrib_u: array of distribution u (usually the sample dataset distribution)
|
|
51
|
-
|
|
52
|
-
Each distribution must contain nonnegative floats that sum up to 1.0.
|
|
53
|
-
"""
|
|
54
|
-
|
|
55
|
-
distrib_t: np.ndarray
|
|
56
|
-
distrib_u: np.ndarray
|
|
57
|
-
|
|
58
|
-
NAME: ClassVar[str]
|
|
59
|
-
|
|
60
|
-
# noinspection PyMethodOverriding
|
|
61
|
-
def __init_subclass__(cls, *, metric_name: str, **kwargs) -> None:
|
|
62
|
-
super().__init_subclass__(**kwargs)
|
|
63
|
-
cls.NAME = metric_name
|
|
64
|
-
|
|
65
|
-
@abc.abstractmethod
|
|
66
|
-
def compute(self) -> float:
|
|
67
|
-
raise NotImplementedError
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class TotalVarianceDistance(HistogramDistanceMetric, metric_name="tvd"):
|
|
71
|
-
"""
|
|
72
|
-
Provides a symmetric drift distance between two periods t and u
|
|
73
|
-
Z - vector of random variables
|
|
74
|
-
Pt - Probability distribution over time span t
|
|
75
|
-
"""
|
|
76
|
-
|
|
77
|
-
def compute(self) -> float:
|
|
78
|
-
"""
|
|
79
|
-
Calculate Total Variance distance.
|
|
80
|
-
|
|
81
|
-
:returns: Total Variance Distance.
|
|
82
|
-
"""
|
|
83
|
-
return np.sum(np.abs(self.distrib_t - self.distrib_u)) / 2
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
class HellingerDistance(HistogramDistanceMetric, metric_name="hellinger"):
|
|
87
|
-
"""
|
|
88
|
-
Hellinger distance is an f divergence measure, similar to the Kullback-Leibler (KL) divergence.
|
|
89
|
-
It used to quantify the difference between two probability distributions.
|
|
90
|
-
However, unlike KL Divergence the Hellinger divergence is symmetric and bounded over a probability space.
|
|
91
|
-
The output range of Hellinger distance is [0,1]. The closer to 0, the more similar the two distributions.
|
|
92
|
-
"""
|
|
93
|
-
|
|
94
|
-
def compute(self) -> float:
|
|
95
|
-
"""
|
|
96
|
-
Calculate Hellinger Distance
|
|
97
|
-
|
|
98
|
-
:returns: Hellinger Distance
|
|
99
|
-
"""
|
|
100
|
-
return np.sqrt(
|
|
101
|
-
max(
|
|
102
|
-
1 - np.sum(np.sqrt(self.distrib_u * self.distrib_t)),
|
|
103
|
-
0, # numerical errors may produce small negative numbers, e.g. -1e-16.
|
|
104
|
-
# However, Cauchy-Schwarz inequality assures this number is in the range [0, 1]
|
|
105
|
-
)
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
class KullbackLeiblerDivergence(HistogramDistanceMetric, metric_name="kld"):
|
|
110
|
-
"""
|
|
111
|
-
KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another.
|
|
112
|
-
It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality.
|
|
113
|
-
KL Divergence of 0, indicates two identical distributions.
|
|
114
|
-
"""
|
|
115
|
-
|
|
116
|
-
@staticmethod
|
|
117
|
-
def _calc_kl_div(
|
|
118
|
-
actual_dist: np.array, expected_dist: np.array, kld_scaling: float
|
|
119
|
-
) -> float:
|
|
120
|
-
"""Return the asymmetric KL divergence"""
|
|
121
|
-
# We take 0*log(0) == 0 for this calculation
|
|
122
|
-
mask = actual_dist != 0
|
|
123
|
-
actual_dist = actual_dist[mask]
|
|
124
|
-
expected_dist = expected_dist[mask]
|
|
125
|
-
return np.sum(
|
|
126
|
-
actual_dist
|
|
127
|
-
* np.log(
|
|
128
|
-
actual_dist / np.where(expected_dist != 0, expected_dist, kld_scaling)
|
|
129
|
-
),
|
|
130
|
-
)
|
|
131
|
-
|
|
132
|
-
def compute(
|
|
133
|
-
self, capping: Optional[float] = None, kld_scaling: float = 1e-4
|
|
134
|
-
) -> float:
|
|
135
|
-
"""
|
|
136
|
-
:param capping: A bounded value for the KL Divergence. For infinite distance, the result is replaced with
|
|
137
|
-
the capping value which indicates a huge differences between the distributions.
|
|
138
|
-
:param kld_scaling: Will be used to replace 0 values for executing the logarithmic operation.
|
|
139
|
-
|
|
140
|
-
:returns: symmetric KL Divergence
|
|
141
|
-
"""
|
|
142
|
-
t_u = self._calc_kl_div(self.distrib_t, self.distrib_u, kld_scaling)
|
|
143
|
-
u_t = self._calc_kl_div(self.distrib_u, self.distrib_t, kld_scaling)
|
|
144
|
-
result = t_u + u_t
|
|
145
|
-
if capping and result == float("inf"):
|
|
146
|
-
return capping
|
|
147
|
-
return result
|
|
148
|
-
|
|
149
|
-
|
|
150
48
|
class VirtualDrift:
|
|
151
49
|
"""
|
|
152
50
|
Virtual Drift object is used for handling the drift calculations.
|
|
@@ -992,7 +890,7 @@ class BatchProcessor:
|
|
|
992
890
|
"""
|
|
993
891
|
stream_http_path = (
|
|
994
892
|
mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
|
|
995
|
-
project=self.project
|
|
893
|
+
project=self.project, namespace=mlrun.mlconf.namespace
|
|
996
894
|
)
|
|
997
895
|
)
|
|
998
896
|
|
|
@@ -20,6 +20,7 @@ import re
|
|
|
20
20
|
from collections.abc import Iterator
|
|
21
21
|
from typing import Any, NamedTuple, Optional, Union, cast
|
|
22
22
|
|
|
23
|
+
import nuclio
|
|
23
24
|
from v3io.dataplane.response import HttpResponseError
|
|
24
25
|
|
|
25
26
|
import mlrun
|
|
@@ -29,6 +30,7 @@ import mlrun.feature_store as fstore
|
|
|
29
30
|
from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
|
|
30
31
|
from mlrun.datastore import get_stream_pusher
|
|
31
32
|
from mlrun.datastore.targets import ParquetTarget
|
|
33
|
+
from mlrun.errors import err_to_str
|
|
32
34
|
from mlrun.model_monitoring.batch import calculate_inputs_statistics
|
|
33
35
|
from mlrun.model_monitoring.helpers import (
|
|
34
36
|
_BatchDict,
|
|
@@ -282,33 +284,33 @@ class MonitoringApplicationController:
|
|
|
282
284
|
|
|
283
285
|
def __init__(
|
|
284
286
|
self,
|
|
285
|
-
|
|
287
|
+
mlrun_context: mlrun.run.MLClientCtx,
|
|
286
288
|
project: str,
|
|
287
289
|
):
|
|
288
290
|
"""
|
|
289
291
|
Initialize Monitoring Application Processor object.
|
|
290
292
|
|
|
291
|
-
:param
|
|
293
|
+
:param mlrun_context: An MLRun context.
|
|
292
294
|
:param project: Project name.
|
|
293
295
|
"""
|
|
294
|
-
self.context =
|
|
296
|
+
self.context = mlrun_context
|
|
295
297
|
self.project = project
|
|
296
298
|
self.project_obj = mlrun.get_or_create_project(project)
|
|
297
299
|
|
|
298
|
-
|
|
300
|
+
mlrun_context.logger.debug(
|
|
301
|
+
f"Initializing {self.__class__.__name__}", project=project
|
|
302
|
+
)
|
|
299
303
|
|
|
300
304
|
self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
|
|
301
305
|
|
|
302
306
|
self._batch_window_generator = _BatchWindowGenerator(
|
|
303
|
-
batch_dict=
|
|
304
|
-
|
|
305
|
-
|
|
307
|
+
batch_dict=json.loads(
|
|
308
|
+
mlrun.get_secret_or_env(
|
|
309
|
+
mm_constants.EventFieldType.BATCH_INTERVALS_DICT
|
|
310
|
+
)
|
|
311
|
+
)
|
|
306
312
|
)
|
|
307
313
|
|
|
308
|
-
# If provided, only model endpoints in that that list will be analyzed
|
|
309
|
-
self.model_endpoints = context.parameters.get(
|
|
310
|
-
mm_constants.EventFieldType.MODEL_ENDPOINTS, None
|
|
311
|
-
)
|
|
312
314
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
313
315
|
self.parquet_directory = get_monitoring_parquet_path(
|
|
314
316
|
self.project_obj,
|
|
@@ -335,66 +337,82 @@ class MonitoringApplicationController:
|
|
|
335
337
|
v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
|
|
336
338
|
)
|
|
337
339
|
|
|
338
|
-
def run(self):
|
|
340
|
+
def run(self, event: nuclio.Event):
|
|
339
341
|
"""
|
|
340
342
|
Main method for run all the relevant monitoring applications on each endpoint
|
|
343
|
+
|
|
344
|
+
:param event: trigger event
|
|
341
345
|
"""
|
|
346
|
+
logger.info("Start running monitoring controller")
|
|
342
347
|
try:
|
|
343
|
-
|
|
348
|
+
applications_names = []
|
|
349
|
+
endpoints = self.db.list_model_endpoints()
|
|
350
|
+
if not endpoints:
|
|
351
|
+
self.context.logger.info(
|
|
352
|
+
"No model endpoints found", project=self.project
|
|
353
|
+
)
|
|
354
|
+
return
|
|
344
355
|
monitoring_functions = self.project_obj.list_model_monitoring_functions()
|
|
345
356
|
if monitoring_functions:
|
|
357
|
+
# Gets only application in ready state
|
|
346
358
|
applications_names = list(
|
|
347
|
-
{
|
|
359
|
+
{
|
|
360
|
+
app.metadata.name
|
|
361
|
+
for app in monitoring_functions
|
|
362
|
+
if app.status.state == "ready"
|
|
363
|
+
}
|
|
348
364
|
)
|
|
349
|
-
|
|
365
|
+
if not applications_names:
|
|
350
366
|
self.context.logger.info(
|
|
351
367
|
"No monitoring functions found", project=self.project
|
|
352
368
|
)
|
|
353
|
-
|
|
369
|
+
return
|
|
354
370
|
|
|
355
371
|
except Exception as e:
|
|
356
|
-
self.context.logger.error(
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
# Initialize a process pool that will be used to run each endpoint applications on a dedicated process
|
|
360
|
-
pool = concurrent.futures.ProcessPoolExecutor(
|
|
361
|
-
max_workers=min(len(endpoints), 10),
|
|
372
|
+
self.context.logger.error(
|
|
373
|
+
"Failed to list endpoints and monitoring applications",
|
|
374
|
+
exc=err_to_str(e),
|
|
362
375
|
)
|
|
363
|
-
|
|
364
|
-
|
|
376
|
+
return
|
|
377
|
+
# Initialize a process pool that will be used to run each endpoint applications on a dedicated process
|
|
378
|
+
pool = concurrent.futures.ProcessPoolExecutor(
|
|
379
|
+
max_workers=min(len(endpoints), 10),
|
|
380
|
+
)
|
|
381
|
+
futures = []
|
|
382
|
+
for endpoint in endpoints:
|
|
383
|
+
if (
|
|
384
|
+
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
385
|
+
and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
|
|
386
|
+
== mm_constants.ModelMonitoringMode.enabled.value
|
|
387
|
+
):
|
|
388
|
+
# Skip router endpoint:
|
|
365
389
|
if (
|
|
366
|
-
endpoint[mm_constants.EventFieldType.
|
|
367
|
-
|
|
368
|
-
== mm_constants.ModelMonitoringMode.enabled.value
|
|
390
|
+
int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
|
|
391
|
+
== mm_constants.EndpointType.ROUTER
|
|
369
392
|
):
|
|
370
|
-
#
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
== mm_constants.EndpointType.ROUTER
|
|
374
|
-
):
|
|
375
|
-
# Router endpoint has no feature stats
|
|
376
|
-
logger.info(
|
|
377
|
-
f"{endpoint[mm_constants.EventFieldType.UID]} is router skipping"
|
|
378
|
-
)
|
|
379
|
-
continue
|
|
380
|
-
future = pool.submit(
|
|
381
|
-
MonitoringApplicationController.model_endpoint_process,
|
|
382
|
-
endpoint=endpoint,
|
|
383
|
-
applications_names=applications_names,
|
|
384
|
-
batch_window_generator=self._batch_window_generator,
|
|
385
|
-
project=self.project,
|
|
386
|
-
parquet_directory=self.parquet_directory,
|
|
387
|
-
storage_options=self.storage_options,
|
|
388
|
-
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
393
|
+
# Router endpoint has no feature stats
|
|
394
|
+
logger.info(
|
|
395
|
+
f"{endpoint[mm_constants.EventFieldType.UID]} is router skipping"
|
|
389
396
|
)
|
|
390
|
-
|
|
397
|
+
continue
|
|
398
|
+
future = pool.submit(
|
|
399
|
+
MonitoringApplicationController.model_endpoint_process,
|
|
400
|
+
endpoint=endpoint,
|
|
401
|
+
applications_names=applications_names,
|
|
402
|
+
batch_window_generator=self._batch_window_generator,
|
|
403
|
+
project=self.project,
|
|
404
|
+
parquet_directory=self.parquet_directory,
|
|
405
|
+
storage_options=self.storage_options,
|
|
406
|
+
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
407
|
+
)
|
|
408
|
+
futures.append(future)
|
|
391
409
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
410
|
+
for future in concurrent.futures.as_completed(futures):
|
|
411
|
+
result = future.result()
|
|
412
|
+
if result:
|
|
413
|
+
self.context.log_results(result)
|
|
396
414
|
|
|
397
|
-
|
|
415
|
+
self._delete_old_parquet(endpoints=endpoints)
|
|
398
416
|
|
|
399
417
|
@classmethod
|
|
400
418
|
def model_endpoint_process(
|
|
@@ -525,7 +543,7 @@ class MonitoringApplicationController:
|
|
|
525
543
|
"""
|
|
526
544
|
if self.parquet_directory.startswith("v3io:///"):
|
|
527
545
|
# create fs with access to the user side (under projects)
|
|
528
|
-
store, _ = mlrun.store_manager.get_or_create_store(
|
|
546
|
+
store, _, _ = mlrun.store_manager.get_or_create_store(
|
|
529
547
|
self.parquet_directory,
|
|
530
548
|
{"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
|
|
531
549
|
)
|
|
@@ -601,12 +619,12 @@ class MonitoringApplicationController:
|
|
|
601
619
|
mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
|
|
602
620
|
mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
|
|
603
621
|
project=project,
|
|
604
|
-
|
|
622
|
+
function_name=mm_constants.MonitoringFunctionNames.WRITER,
|
|
605
623
|
),
|
|
606
624
|
}
|
|
607
625
|
for app_name in applications_names:
|
|
608
626
|
data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
|
|
609
|
-
stream_uri = get_stream_path(project=project,
|
|
627
|
+
stream_uri = get_stream_path(project=project, function_name=app_name)
|
|
610
628
|
|
|
611
629
|
logger.info(
|
|
612
630
|
f"push endpoint_id {endpoint_id} to {app_name} by stream :{stream_uri}"
|
|
@@ -11,19 +11,27 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
import nuclio
|
|
14
15
|
|
|
15
16
|
import mlrun
|
|
16
17
|
from mlrun.model_monitoring.controller import MonitoringApplicationController
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
def handler(context:
|
|
20
|
+
def handler(context: nuclio.Context, event: nuclio.Event) -> None:
|
|
20
21
|
"""
|
|
21
22
|
Run model monitoring application processor
|
|
22
23
|
|
|
23
|
-
:param context: the
|
|
24
|
+
:param context: the Nuclio context
|
|
25
|
+
:param event: trigger event
|
|
24
26
|
"""
|
|
27
|
+
context.user_data.monitor_app_controller.run(event)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def init_context(context):
|
|
31
|
+
mlrun_context = mlrun.get_or_create_ctx("model_monitoring_controller")
|
|
32
|
+
mlrun_context.logger.info("Initialize monitoring app controller")
|
|
25
33
|
monitor_app_controller = MonitoringApplicationController(
|
|
26
|
-
|
|
27
|
-
project=
|
|
34
|
+
mlrun_context=mlrun_context,
|
|
35
|
+
project=mlrun_context.project,
|
|
28
36
|
)
|
|
29
|
-
monitor_app_controller
|
|
37
|
+
setattr(context.user_data, "monitor_app_controller", monitor_app_controller)
|