mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/artifacts/base.py +2 -1
- mlrun/artifacts/plots.py +9 -5
- mlrun/common/constants.py +1 -0
- mlrun/common/schemas/__init__.py +10 -0
- mlrun/common/schemas/api_gateway.py +85 -0
- mlrun/common/schemas/auth.py +2 -2
- mlrun/config.py +19 -4
- mlrun/datastore/sources.py +5 -4
- mlrun/datastore/targets.py +16 -20
- mlrun/db/base.py +16 -0
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +50 -8
- mlrun/db/nopdb.py +13 -0
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +1 -1
- mlrun/launcher/client.py +1 -1
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/model_monitoring/api.py +6 -12
- mlrun/model_monitoring/application.py +21 -21
- mlrun/model_monitoring/applications/histogram_data_drift.py +130 -40
- mlrun/model_monitoring/batch.py +1 -42
- mlrun/model_monitoring/controller.py +1 -8
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +45 -4
- mlrun/model_monitoring/stream_processing.py +2 -0
- mlrun/projects/project.py +229 -16
- mlrun/run.py +70 -74
- mlrun/runtimes/__init__.py +35 -0
- mlrun/runtimes/base.py +15 -11
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +300 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +283 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +87 -0
- mlrun/runtimes/nuclio/function.py +50 -1
- mlrun/runtimes/pod.py +1 -1
- mlrun/serving/states.py +7 -19
- mlrun/utils/logger.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/METADATA +1 -1
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/RECORD +47 -42
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/application.py CHANGED

@@ -16,13 +16,13 @@ import dataclasses
 import json
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast

 import numpy as np
 import pandas as pd

 import mlrun.common.helpers
-import mlrun.common.
+import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.utils.v3io_clients
 from mlrun.datastore import get_stream_pusher
@@ -84,8 +84,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         class MyApp(ApplicationBase):
             def do_tracking(
                 self,
-                sample_df_stats:
-                feature_stats:
+                sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+                feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
                 start_infer_time: pd.Timestamp,
                 end_infer_time: pd.Timestamp,
                 schedule_time: pd.Timestamp,
@@ -93,7 +93,7 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
                 endpoint_id: str,
                 output_stream_uri: str,
             ) -> ModelMonitoringApplicationResult:
-                self.context.log_artifact(TableArtifact("sample_df_stats", df=sample_df_stats))
+                self.context.log_artifact(TableArtifact("sample_df_stats", df=self.dict_to_histogram(sample_df_stats)))
                 return ModelMonitoringApplicationResult(
                     name="data_drift_test",
                     value=0.5,
@@ -126,14 +126,16 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return results, event

     def _lazy_init(self, app_name: str):
-        self.context =
+        self.context = cast(
+            mlrun.MLClientCtx, self._create_context_for_logging(app_name=app_name)
+        )

     @abstractmethod
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats:
-        feature_stats:
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: pd.DataFrame,
         start_infer_time: pd.Timestamp,
         end_infer_time: pd.Timestamp,
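Note on the `_lazy_init` change: `typing.cast` has no runtime effect; it only narrows the declared type so checkers treat the created context as `mlrun.MLClientCtx`. A minimal standalone sketch of the pattern (hypothetical names, not mlrun code):

    from typing import cast

    class Logger: ...

    class RichLogger(Logger):
        def log(self, msg: str) -> None:
            print(msg)

    def make_logger() -> Logger:
        # The factory declares the base type but actually returns a RichLogger.
        return RichLogger()

    # cast() is a no-op at runtime; it only informs the type checker.
    logger = cast(RichLogger, make_logger())
    logger.log("hello")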
@@ -147,8 +149,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         Implement this method with your custom monitoring logic.

         :param application_name: (str) the app name
-        :param sample_df_stats: (
-        :param feature_stats: (
+        :param sample_df_stats: (FeatureStats) The new sample distribution dictionary.
+        :param feature_stats: (FeatureStats) The train sample distribution dictionary.
         :param sample_df: (pd.DataFrame) The new sample DataFrame.
         :param start_infer_time: (pd.Timestamp) Start time of the monitoring schedule.
         :param end_infer_time: (pd.Timestamp) End time of the monitoring schedule.
@@ -167,8 +169,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         event: dict[str, Any],
     ) -> tuple[
         str,
-
-
+        mlrun.common.model_monitoring.helpers.FeatureStats,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
         pd.DataFrame,
         pd.Timestamp,
         pd.Timestamp,
@@ -184,8 +186,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):

         :return: A tuple of:
                  [0] = (str) application name
-                 [1] = (
-                 [2] = (
+                 [1] = (dict) current input statistics
+                 [2] = (dict) train statistics
                  [3] = (pd.DataFrame) current input data
                  [4] = (pd.Timestamp) start time of the monitoring schedule
                  [5] = (pd.Timestamp) end time of the monitoring schedule
@@ -197,12 +199,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         end_time = pd.Timestamp(event[mm_constant.ApplicationEvent.END_INFER_TIME])
         return (
             event[mm_constant.ApplicationEvent.APPLICATION_NAME],
-
-
-            ),
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS])
-            ),
+            json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS]),
+            json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS]),
             ParquetTarget(
                 path=event[mm_constant.ApplicationEvent.SAMPLE_PARQUET_PATH]
             ).as_df(start_time=start_time, end_time=end_time, time_column="timestamp"),
@@ -223,7 +221,9 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return context

     @staticmethod
-    def
+    def dict_to_histogram(
+        histogram_dict: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> pd.DataFrame:
         """
         Convert histogram dictionary to pandas DataFrame with feature histograms as columns

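The renamed `dict_to_histogram` (now public, and used in the updated docstring example above) converts a feature-stats mapping into a DataFrame with one histogram column per feature. A rough sketch of such a conversion, assuming each entry stores a "hist" pair of counts and bin edges and that the counts are normalized to frequencies; the actual mlrun implementation may differ:

    import pandas as pd

    def dict_to_histogram(histogram_dict: dict) -> pd.DataFrame:
        # Keep only the histogram counts of each feature, one column per feature.
        histograms = {
            feature: stats["hist"][0]
            for feature, stats in histogram_dict.items()
            if "hist" in stats
        }
        df = pd.DataFrame(histograms)
        # Normalize the counts to frequencies so distributions are comparable.
        return df / df.sum()

    stats = {"f1": {"hist": [[1, 3, 6], [0.0, 0.5, 1.0, 1.5]]}}
    print(dict_to_histogram(stats))  # column "f1" holds 0.1, 0.3, 0.6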
mlrun/model_monitoring/applications/histogram_data_drift.py CHANGED

@@ -13,13 +13,17 @@
 # limitations under the License.

 from dataclasses import dataclass
-from typing import Final, Optional, Protocol
+from typing import Final, Optional, Protocol, cast

 import numpy as np
-from pandas import DataFrame, Timestamp
+from pandas import DataFrame, Series, Timestamp

+import mlrun.artifacts
+import mlrun.common.model_monitoring.helpers
+import mlrun.model_monitoring.features_drift_table as mm_drift_table
 from mlrun.common.schemas.model_monitoring.constants import (
     MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME,
+    EventFieldType,
     ResultKindApp,
     ResultStatusApp,
 )
@@ -27,7 +31,7 @@ from mlrun.model_monitoring.application import (
     ModelMonitoringApplicationBase,
     ModelMonitoringApplicationResult,
 )
-from mlrun.model_monitoring.
+from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
     KullbackLeiblerDivergence,
@@ -115,31 +119,24 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):

     def _compute_metrics_per_feature(
         self, sample_df_stats: DataFrame, feature_stats: DataFrame
-    ) ->
+    ) -> DataFrame:
         """Compute the metrics for the different features and labels"""
-        metrics_per_feature
-        metric_class
-
+        metrics_per_feature = DataFrame(
+            columns=[metric_class.NAME for metric_class in self.metrics]
+        )

-        for
-
-
-            assert sample_feat == reference_feat, "The features do not match"
+        for feature_name in feature_stats:
+            sample_hist = np.asarray(sample_df_stats[feature_name])
+            reference_hist = np.asarray(feature_stats[feature_name])
             self.context.logger.info(
-                "Computing metrics for feature", feature_name=
+                "Computing metrics for feature", feature_name=feature_name
             )
-
-
-
-
-            self.
-
-                metric_name=metric_name,
-                feature_name=sample_feat,
-            )
-            metrics_per_feature[metric].append(
-                metric(distrib_t=sample_arr, distrib_u=reference_arr).compute()
-            )
+            metrics_per_feature.loc[feature_name] = {  # pyright: ignore[reportCallIssue,reportArgumentType]
+                metric.NAME: metric(
+                    distrib_t=sample_hist, distrib_u=reference_hist
+                ).compute()
+                for metric in self.metrics
+            }
         self.context.logger.info("Finished computing the metrics")

         return metrics_per_feature
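The rewritten loop fills one DataFrame row per feature by assigning a dict whose keys match the metric column names. The same `DataFrame.loc` enlargement pattern, isolated with a toy metric:

    import numpy as np
    import pandas as pd

    def tvd(t: np.ndarray, u: np.ndarray) -> float:
        # Total variance distance between two discrete distributions.
        return float(np.abs(t - u).sum() / 2)

    metrics = {"tvd": tvd}
    per_feature = pd.DataFrame(columns=list(metrics))

    sample = {"f1": np.array([0.2, 0.8]), "f2": np.array([0.5, 0.5])}
    reference = {"f1": np.array([0.3, 0.7]), "f2": np.array([0.5, 0.5])}

    for feature in reference:
        # The dict keys match the frame's columns, so .loc appends a full row.
        per_feature.loc[feature] = {
            name: fn(sample[feature], reference[feature])
            for name, fn in metrics.items()
        }
    print(per_feature)  # rows f1/f2, column tvd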
@@ -147,37 +144,37 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     def _add_general_drift_result(
         self, results: list[ModelMonitoringApplicationResult], value: float
     ) -> None:
+        """Add the general drift result to the results list and log it"""
+        status = self._value_classifier.value_to_status(value)
         results.append(
             ModelMonitoringApplicationResult(
                 name="general_drift",
                 value=value,
                 kind=self.METRIC_KIND,
-                status=
+                status=status,
             )
         )

     def _get_results(
-        self, metrics_per_feature:
+        self, metrics_per_feature: DataFrame
     ) -> list[ModelMonitoringApplicationResult]:
         """Average the metrics over the features and add the status"""
         results: list[ModelMonitoringApplicationResult] = []
-
-
-
-
-
-
-        if
+
+        self.context.logger.debug("Averaging metrics over the features")
+        metrics_mean = metrics_per_feature.mean().to_dict()
+
+        self.context.logger.debug("Creating the results")
+        for name, value in metrics_mean.items():
+            if name == KullbackLeiblerDivergence.NAME:
                 # This metric is not bounded from above [0, inf).
                 # No status is currently reported for KL divergence
                 status = ResultStatusApp.irrelevant
             else:
                 status = self._value_classifier.value_to_status(value)
-            if metric_class in self._REQUIRED_METRICS:
-                hellinger_tvd_values.append(value)
             results.append(
                 ModelMonitoringApplicationResult(
-                    name=f"{
+                    name=f"{name}_mean",
                     value=value,
                     kind=self.METRIC_KIND,
                     status=status,
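`_get_results` now averages each metric column over all features via `metrics_per_feature.mean().to_dict()` and classifies every mean, except the unbounded KL divergence. A condensed sketch of the classification step, with a hypothetical two-threshold classifier standing in for `self._value_classifier`:

    import pandas as pd

    def value_to_status(value: float) -> str:
        # Hypothetical thresholds for a drift value in [0, 1].
        if value < 0.3:
            return "no_detection"
        if value < 0.7:
            return "potential_detection"
        return "detected"

    per_feature = pd.DataFrame(
        {"hellinger": [0.1, 0.5], "kl_divergence": [0.2, 4.0]},
        index=["f1", "f2"],
    )
    for name, value in per_feature.mean().to_dict().items():
        # KL divergence is unbounded above, so no status is derived from it.
        status = "irrelevant" if name == "kl_divergence" else value_to_status(value)
        print(f"{name}_mean", round(value, 3), status)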
@@ -185,16 +182,102 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
             )

         self._add_general_drift_result(
-            results=results,
+            results=results,
+            value=np.mean(
+                [
+                    metrics_mean[HellingerDistance.NAME],
+                    metrics_mean[TotalVarianceDistance.NAME],
+                ]
+            ),
         )

+        self.context.logger.info("Finished with the results")
         return results

+    @staticmethod
+    def _remove_timestamp_feature(
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
+        """
+        Drop the 'timestamp' feature if it exists, as it is irrelevant
+        in the plotly artifact
+        """
+        sample_set_statistics = mlrun.common.model_monitoring.helpers.FeatureStats(
+            sample_set_statistics.copy()
+        )
+        if EventFieldType.TIMESTAMP in sample_set_statistics:
+            del sample_set_statistics[EventFieldType.TIMESTAMP]
+        return sample_set_statistics
+
+    def _log_json_artifact(self, drift_per_feature_values: Series) -> None:
+        """Log the drift values as a JSON artifact"""
+        self.context.logger.debug("Logging drift value per feature JSON artifact")
+        self.context.log_artifact(
+            mlrun.artifacts.Artifact(
+                body=drift_per_feature_values.to_json(),
+                format="json",
+                key="features_drift_results",
+            )
+        )
+        self.context.logger.debug("Logged JSON artifact successfully")
+
+    def _log_plotly_table_artifact(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        drift_per_feature_values: Series,
+    ) -> None:
+        """Log the Plotly drift table artifact"""
+        self.context.logger.debug(
+            "Feature stats",
+            sample_set_statistics=sample_set_statistics,
+            inputs_statistics=inputs_statistics,
+        )
+
+        self.context.logger.debug("Computing drift results per feature")
+        drift_results = {
+            cast(str, key): (self._value_classifier.value_to_status(value), value)
+            for key, value in drift_per_feature_values.items()
+        }
+        self.context.logger.debug("Logging plotly artifact")
+        self.context.log_artifact(
+            mm_drift_table.FeaturesDriftTablePlot().produce(
+                sample_set_statistics=sample_set_statistics,
+                inputs_statistics=inputs_statistics,
+                metrics=metrics_per_feature.T.to_dict(),
+                drift_results=drift_results,
+            )
+        )
+        self.context.logger.debug("Logged plotly artifact successfully")
+
+    def _log_drift_artifacts(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        log_json_artifact: bool = True,
+    ) -> None:
+        """Log JSON and Plotly drift data per feature artifacts"""
+        drift_per_feature_values = metrics_per_feature[
+            [HellingerDistance.NAME, TotalVarianceDistance.NAME]
+        ].mean(axis=1)
+
+        if log_json_artifact:
+            self._log_json_artifact(drift_per_feature_values)
+
+        self._log_plotly_table_artifact(
+            sample_set_statistics=self._remove_timestamp_feature(sample_set_statistics),
+            inputs_statistics=inputs_statistics,
+            metrics_per_feature=metrics_per_feature,
+            drift_per_feature_values=drift_per_feature_values,
+        )
+
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats:
-        feature_stats:
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: DataFrame,
         start_infer_time: Timestamp,
         end_infer_time: Timestamp,
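In `_log_drift_artifacts`, the drift value per feature is the row-wise mean of the Hellinger and total-variance columns only; the resulting Series feeds both the JSON artifact and the Plotly table. The core computation in isolation, with hypothetical column names:

    import pandas as pd

    metrics_per_feature = pd.DataFrame(
        {"hellinger": [0.10, 0.60], "tvd": [0.20, 0.80], "kld": [0.5, 3.0]},
        index=["f1", "f2"],
    )
    # Row-wise mean over the two bounded metrics; KL divergence is excluded.
    drift_per_feature = metrics_per_feature[["hellinger", "tvd"]].mean(axis=1)
    print(drift_per_feature.to_json())  # {"f1":0.15,"f2":0.7}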
@@ -210,7 +293,14 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         """
         self.context.logger.debug("Starting to run the application")
         metrics_per_feature = self._compute_metrics_per_feature(
-            sample_df_stats=sample_df_stats,
+            sample_df_stats=self.dict_to_histogram(sample_df_stats),
+            feature_stats=self.dict_to_histogram(feature_stats),
+        )
+        self.context.logger.debug("Saving artifacts")
+        self._log_drift_artifacts(
+            inputs_statistics=feature_stats,
+            sample_set_statistics=sample_df_stats,
+            metrics_per_feature=metrics_per_feature,
         )
         self.context.logger.debug("Computing average per metric")
         results = self._get_results(metrics_per_feature)
mlrun/model_monitoring/batch.py CHANGED
@@ -33,6 +33,7 @@ import mlrun.common.schemas.model_monitoring
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
 import mlrun.utils.v3io_clients
+from mlrun.model_monitoring.helpers import calculate_inputs_statistics
 from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
@@ -353,48 +354,6 @@ class VirtualDrift:
         return drift_status


-def calculate_inputs_statistics(
-    sample_set_statistics: dict, inputs: pd.DataFrame
-) -> dict:
-    """
-    Calculate the inputs data statistics for drift monitoring purpose.
-
-    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
-                                  histograms of each feature will be used to recalculate the histograms of the inputs.
-    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
-                                  sample set.
-
-    :returns: The calculated statistics of the inputs data.
-    """
-
-    # Use `DFDataInfer` to calculate the statistics over the inputs:
-    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
-    )
-
-    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics.keys():
-        if feature in sample_set_statistics:
-            counts, bins = np.histogram(
-                inputs[feature].to_numpy(),
-                bins=sample_set_statistics[feature]["hist"][1],
-            )
-            inputs_statistics[feature]["hist"] = [
-                counts.tolist(),
-                bins.tolist(),
-            ]
-        elif "hist" in inputs_statistics[feature]:
-            # Comply with the other common features' histogram length
-            mlrun.common.model_monitoring.helpers.pad_hist(
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
-
-    return inputs_statistics
-
-
 class BatchProcessor:
     """
     The main object to handle the batch processing job. This object is used to get the required configurations and
mlrun/model_monitoring/controller.py CHANGED

@@ -31,10 +31,10 @@ from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_his
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
 from mlrun.errors import err_to_str
-from mlrun.model_monitoring.batch import calculate_inputs_statistics
 from mlrun.model_monitoring.helpers import (
     _BatchDict,
     batch_dict2timedelta,
+    calculate_inputs_statistics,
     get_monitoring_parquet_path,
     get_stream_path,
 )
@@ -445,13 +445,6 @@ class MonitoringApplicationController:
         m_fs = fstore.get_feature_set(
             endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
         )
-        labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
-        if labels:
-            if isinstance(labels, str):
-                labels = json.loads(labels)
-            for label in labels:
-                if label not in list(m_fs.spec.features.keys()):
-                    m_fs.add_feature(fstore.Feature(name=label, value_type="float"))

         for application in applications_names:
             batch_window = batch_window_generator.get_batch_window(
mlrun/model_monitoring/features_drift_table.py CHANGED

@@ -21,9 +21,34 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots

 import mlrun.common.schemas.model_monitoring
+from mlrun.artifacts import PlotlyArtifact

 # A type for representing a drift result, a tuple of the status and the drift mean:
-DriftResultType = tuple[
+DriftResultType = tuple[
+    mlrun.common.schemas.model_monitoring.constants.ResultStatusApp, float
+]
+
+
+class _PlotlyTableArtifact(PlotlyArtifact):
+    """A custom class for plotly table artifacts"""
+
+    @staticmethod
+    def _disable_table_dragging(figure_html: str) -> str:
+        """
+        Disable the table columns dragging by adding the following
+        JavaScript code
+        """
+        start, end = figure_html.rsplit(";", 1)
+        middle = (
+            ';for (const element of document.getElementsByClassName("table")) '
+            '{element.style.pointerEvents = "none";}'
+        )
+        figure_html = start + middle + end
+        return figure_html
+
+    def get_body(self) -> str:
+        """Get the adjusted HTML representation of the figure"""
+        return self._disable_table_dragging(super().get_body())


 class FeaturesDriftTablePlot:
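`_PlotlyTableArtifact` patches the figure HTML by splitting it at the last `;` and splicing in JavaScript that disables pointer events on table elements. The string surgery is easy to verify standalone:

    def disable_table_dragging(figure_html: str) -> str:
        # Split at the last semicolon and splice the extra JavaScript in between.
        start, end = figure_html.rsplit(";", 1)
        middle = (
            ';for (const element of document.getElementsByClassName("table")) '
            '{element.style.pointerEvents = "none";}'
        )
        return start + middle + end

    print(disable_table_dragging("<script>draw();</script>"))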
@@ -62,9 +87,9 @@ class FeaturesDriftTablePlot:

     # Status configurations:
     _STATUS_COLORS = {
-        mlrun.common.schemas.model_monitoring.
-        mlrun.common.schemas.model_monitoring.
-        mlrun.common.schemas.model_monitoring.
+        mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.no_detection: "rgb(0,176,80)",  # Green
+        mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.potential_detection: "rgb(255,192,0)",  # Orange
+        mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.detected: "rgb(208,0,106)",  # Magenta
     }

     # Font configurations:
@@ -97,7 +122,7 @@ class FeaturesDriftTablePlot:
         inputs_statistics: dict,
         metrics: dict[str, Union[dict, float]],
         drift_results: dict[str, DriftResultType],
-    ) ->
+    ) -> _PlotlyTableArtifact:
         """
         Produce the html code of the table plot with the given information and the stored configurations in the class.

@@ -106,9 +131,8 @@ class FeaturesDriftTablePlot:
         :param metrics:        The drift detection metrics calculated on the sample set and inputs.
         :param drift_results:  The drift results per feature according to the rules of the monitor.

-        :return: The
+        :return: The drift table as a plotly artifact.
         """
-        # Plot the drift table:
         figure = self._plot(
             features=list(inputs_statistics.keys()),
             sample_set_statistics=sample_set_statistics,
@@ -116,19 +140,7 @@ class FeaturesDriftTablePlot:
             metrics=metrics,
             drift_results=drift_results,
         )
-
-        # Get its HTML representation:
-        figure_html = figure.to_html()
-
-        # Turn off the table columns dragging by injecting the following JavaScript code:
-        start, end = figure_html.rsplit(";", 1)
-        middle = (
-            ';for (const element of document.getElementsByClassName("table")) '
-            '{element.style.pointerEvents = "none";}'
-        )
-        figure_html = start + middle + end
-
-        return figure_html
+        return _PlotlyTableArtifact(figure=figure, key="drift_table_plot")

     def _read_columns_names(self, statistics_dictionary: dict, drift_metrics: dict):
         """
@@ -366,10 +378,10 @@ class FeaturesDriftTablePlot:
         bins = np.array(bins)
         if bins[0] == -sys.float_info.max:
             bins[0] = bins[1] - (bins[2] - bins[1])
-            hovertext[0] = f"(
+            hovertext[0] = f"(-inf, {bins[1]})"
         if bins[-1] == sys.float_info.max:
             bins[-1] = bins[-2] + (bins[-2] - bins[-3])
-            hovertext[-1] = f"({bins[-2]},
+            hovertext[-1] = f"({bins[-2]}, inf)"
         # Center the bins (leave the first one):
         bins = 0.5 * (bins[:-1] + bins[1:])
         # Plot the histogram as a line with filled background below it:
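The hover-text fix restores the open-interval labels for histograms whose outer bin edges are the `sys.float_info.max` sentinels. The same edge handling, extracted into a runnable snippet:

    import sys

    import numpy as np

    bins = np.array([-sys.float_info.max, 0.0, 1.0, 2.0, sys.float_info.max])
    hovertext = [f"({bins[i]}, {bins[i + 1]})" for i in range(len(bins) - 1)]
    if bins[0] == -sys.float_info.max:
        bins[0] = bins[1] - (bins[2] - bins[1])  # synthetic finite edge for plotting
        hovertext[0] = f"(-inf, {bins[1]})"
    if bins[-1] == sys.float_info.max:
        bins[-1] = bins[-2] + (bins[-2] - bins[-3])
        hovertext[-1] = f"({bins[-2]}, inf)"
    print(hovertext)  # outer intervals now read (-inf, 0.0) and (2.0, inf)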
mlrun/model_monitoring/helpers.py CHANGED

@@ -15,6 +15,9 @@
 import datetime
 import typing

+import numpy as np
+import pandas as pd
+
 import mlrun
 import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas
@@ -36,10 +39,6 @@ class _BatchDict(typing.TypedDict):
     days: int


-class _MLRunNoRunsFoundError(Exception):
-    pass
-
-
 def get_stream_path(
     project: str = None,
     function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
@@ -212,3 +211,45 @@ def update_model_endpoint_last_request(
         endpoint_id=model_endpoint.metadata.uid,
         attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
     )
+
+
+def calculate_inputs_statistics(
+    sample_set_statistics: dict, inputs: pd.DataFrame
+) -> dict:
+    """
+    Calculate the inputs data statistics for drift monitoring purpose.
+
+    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
+                                  histograms of each feature will be used to recalculate the histograms of the inputs.
+    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
+                                  sample set.
+
+    :returns: The calculated statistics of the inputs data.
+    """
+
+    # Use `DFDataInfer` to calculate the statistics over the inputs:
+    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
+        df=inputs,
+        options=mlrun.data_types.infer.InferOptions.Histogram,
+    )
+
+    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
+    for feature in inputs_statistics.keys():
+        if feature in sample_set_statistics:
+            counts, bins = np.histogram(
+                inputs[feature].to_numpy(),
+                bins=sample_set_statistics[feature]["hist"][1],
+            )
+            inputs_statistics[feature]["hist"] = [
+                counts.tolist(),
+                bins.tolist(),
+            ]
+        elif "hist" in inputs_statistics[feature]:
+            # Comply with the other common features' histogram length
+            mlrun.common.model_monitoring.helpers.pad_hist(
+                mlrun.common.model_monitoring.helpers.Histogram(
+                    inputs_statistics[feature]["hist"]
+                )
+            )
+
+    return inputs_statistics
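`calculate_inputs_statistics` (moved here from batch.py) recomputes each input histogram over the bin edges stored in the sample-set statistics, so the live and reference distributions share edges. The essential rebinning step, isolated with toy data:

    import numpy as np
    import pandas as pd

    inputs = pd.DataFrame({"f1": [0.1, 0.2, 0.4, 0.9]})
    reference_bins = [0.0, 0.25, 0.5, 0.75, 1.0]  # edges from the sample-set stats
    # Recompute the counts of the live inputs over the stored reference edges:
    counts, bins = np.histogram(inputs["f1"].to_numpy(), bins=reference_bins)
    print(counts.tolist(), bins.tolist())  # [2, 1, 0, 1] with the same edges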