mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun has been flagged as possibly problematic.

Files changed (47)
  1. mlrun/artifacts/base.py +2 -1
  2. mlrun/artifacts/plots.py +9 -5
  3. mlrun/common/constants.py +1 -0
  4. mlrun/common/schemas/__init__.py +10 -0
  5. mlrun/common/schemas/api_gateway.py +85 -0
  6. mlrun/common/schemas/auth.py +2 -2
  7. mlrun/config.py +19 -4
  8. mlrun/datastore/sources.py +5 -4
  9. mlrun/datastore/targets.py +16 -20
  10. mlrun/db/base.py +16 -0
  11. mlrun/db/factory.py +1 -1
  12. mlrun/db/httpdb.py +50 -8
  13. mlrun/db/nopdb.py +13 -0
  14. mlrun/launcher/__init__.py +1 -1
  15. mlrun/launcher/base.py +1 -1
  16. mlrun/launcher/client.py +1 -1
  17. mlrun/launcher/factory.py +1 -1
  18. mlrun/launcher/local.py +1 -1
  19. mlrun/launcher/remote.py +1 -1
  20. mlrun/model_monitoring/api.py +6 -12
  21. mlrun/model_monitoring/application.py +21 -21
  22. mlrun/model_monitoring/applications/histogram_data_drift.py +130 -40
  23. mlrun/model_monitoring/batch.py +1 -42
  24. mlrun/model_monitoring/controller.py +1 -8
  25. mlrun/model_monitoring/features_drift_table.py +34 -22
  26. mlrun/model_monitoring/helpers.py +45 -4
  27. mlrun/model_monitoring/stream_processing.py +2 -0
  28. mlrun/projects/project.py +229 -16
  29. mlrun/run.py +70 -74
  30. mlrun/runtimes/__init__.py +35 -0
  31. mlrun/runtimes/base.py +15 -11
  32. mlrun/runtimes/nuclio/__init__.py +1 -0
  33. mlrun/runtimes/nuclio/api_gateway.py +300 -0
  34. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  35. mlrun/runtimes/nuclio/application/application.py +283 -0
  36. mlrun/runtimes/nuclio/application/reverse_proxy.go +87 -0
  37. mlrun/runtimes/nuclio/function.py +50 -1
  38. mlrun/runtimes/pod.py +1 -1
  39. mlrun/serving/states.py +7 -19
  40. mlrun/utils/logger.py +2 -2
  41. mlrun/utils/version/version.json +2 -2
  42. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/METADATA +1 -1
  43. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/RECORD +47 -42
  44. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/WHEEL +1 -1
  45. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/LICENSE +0 -0
  46. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/entry_points.txt +0 -0
  47. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc6.dist-info}/top_level.txt +0 -0

mlrun/model_monitoring/application.py

@@ -16,13 +16,13 @@ import dataclasses
 import json
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
 
 import numpy as np
 import pandas as pd
 
 import mlrun.common.helpers
-import mlrun.common.schemas.model_monitoring
+import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.utils.v3io_clients
 from mlrun.datastore import get_stream_pusher
@@ -84,8 +84,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
     class MyApp(ApplicationBase):
         def do_tracking(
             self,
-            sample_df_stats: pd.DataFrame,
-            feature_stats: pd.DataFrame,
+            sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+            feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
             start_infer_time: pd.Timestamp,
             end_infer_time: pd.Timestamp,
             schedule_time: pd.Timestamp,
@@ -93,7 +93,7 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
             endpoint_id: str,
             output_stream_uri: str,
         ) -> ModelMonitoringApplicationResult:
-            self.context.log_artifact(TableArtifact("sample_df_stats", df=sample_df_stats))
+            self.context.log_artifact(TableArtifact("sample_df_stats", df=self.dict_to_histogram(sample_df_stats)))
             return ModelMonitoringApplicationResult(
                 name="data_drift_test",
                 value=0.5,
@@ -126,14 +126,16 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return results, event
 
     def _lazy_init(self, app_name: str):
-        self.context = self._create_context_for_logging(app_name=app_name)
+        self.context = cast(
+            mlrun.MLClientCtx, self._create_context_for_logging(app_name=app_name)
+        )
 
     @abstractmethod
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats: pd.DataFrame,
-        feature_stats: pd.DataFrame,
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: pd.DataFrame,
         start_infer_time: pd.Timestamp,
         end_infer_time: pd.Timestamp,
@@ -147,8 +149,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         Implement this method with your custom monitoring logic.
 
         :param application_name: (str) the app name
-        :param sample_df_stats:  (pd.DataFrame) The new sample distribution DataFrame.
-        :param feature_stats:    (pd.DataFrame) The train sample distribution DataFrame.
+        :param sample_df_stats:  (FeatureStats) The new sample distribution dictionary.
+        :param feature_stats:    (FeatureStats) The train sample distribution dictionary.
         :param sample_df:        (pd.DataFrame) The new sample DataFrame.
         :param start_infer_time: (pd.Timestamp) Start time of the monitoring schedule.
         :param end_infer_time:   (pd.Timestamp) End time of the monitoring schedule.
@@ -167,8 +169,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         event: dict[str, Any],
     ) -> tuple[
         str,
-        pd.DataFrame,
-        pd.DataFrame,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
         pd.DataFrame,
        pd.Timestamp,
         pd.Timestamp,
@@ -184,8 +186,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
 
         :return: A tuple of:
                  [0] = (str) application name
-                 [1] = (pd.DataFrame) current input statistics
-                 [2] = (pd.DataFrame) train statistics
+                 [1] = (dict) current input statistics
+                 [2] = (dict) train statistics
                  [3] = (pd.DataFrame) current input data
                  [4] = (pd.Timestamp) start time of the monitoring schedule
                  [5] = (pd.Timestamp) end time of the monitoring schedule
@@ -197,12 +199,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         end_time = pd.Timestamp(event[mm_constant.ApplicationEvent.END_INFER_TIME])
         return (
             event[mm_constant.ApplicationEvent.APPLICATION_NAME],
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS])
-            ),
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS])
-            ),
+            json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS]),
+            json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS]),
             ParquetTarget(
                 path=event[mm_constant.ApplicationEvent.SAMPLE_PARQUET_PATH]
             ).as_df(start_time=start_time, end_time=end_time, time_column="timestamp"),
@@ -223,7 +221,9 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return context
 
     @staticmethod
-    def _dict_to_histogram(histogram_dict: dict[str, dict[str, Any]]) -> pd.DataFrame:
+    def dict_to_histogram(
+        histogram_dict: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> pd.DataFrame:
         """
         Convert histogram dictionary to pandas DataFrame with feature histograms as columns
 
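Note: with this change, do_tracking receives the raw statistics dictionaries (FeatureStats) instead of pre-built DataFrames, and the former private _dict_to_histogram helper becomes the public static method dict_to_histogram. A minimal sketch of the conversion, assuming a toy stats dict in the FeatureStats shape (each feature holding a "hist" entry of [counts, bin_edges]); the feature names and numbers here are invented:

    from mlrun.model_monitoring.application import ModelMonitoringApplicationBase

    # Illustrative stats in the shape the application event now carries:
    feature_stats = {
        "f0": {"hist": [[5, 10, 5], [0.0, 1.0, 2.0, 3.0]]},
        "f1": {"hist": [[2, 8, 10], [0.0, 0.5, 1.0, 1.5]]},
    }

    # dict_to_histogram is a public @staticmethod as of rc6, so it can be
    # called without instantiating an application subclass:
    df = ModelMonitoringApplicationBase.dict_to_histogram(feature_stats)
    print(df)  # one column per feature, histogram values as rows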

mlrun/model_monitoring/applications/histogram_data_drift.py

@@ -13,13 +13,17 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-from typing import Final, Optional, Protocol
+from typing import Final, Optional, Protocol, cast
 
 import numpy as np
-from pandas import DataFrame, Timestamp
+from pandas import DataFrame, Series, Timestamp
 
+import mlrun.artifacts
+import mlrun.common.model_monitoring.helpers
+import mlrun.model_monitoring.features_drift_table as mm_drift_table
 from mlrun.common.schemas.model_monitoring.constants import (
     MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME,
+    EventFieldType,
     ResultKindApp,
     ResultStatusApp,
 )
@@ -27,7 +31,7 @@ from mlrun.model_monitoring.application import (
     ModelMonitoringApplicationBase,
     ModelMonitoringApplicationResult,
 )
-from mlrun.model_monitoring.batch import (
+from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
     KullbackLeiblerDivergence,
@@ -115,31 +119,24 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
 
     def _compute_metrics_per_feature(
         self, sample_df_stats: DataFrame, feature_stats: DataFrame
-    ) -> dict[type[HistogramDistanceMetric], list[float]]:
+    ) -> DataFrame:
         """Compute the metrics for the different features and labels"""
-        metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]] = {
-            metric_class: [] for metric_class in self.metrics
-        }
+        metrics_per_feature = DataFrame(
+            columns=[metric_class.NAME for metric_class in self.metrics]
+        )
 
-        for (sample_feat, sample_hist), (reference_feat, reference_hist) in zip(
-            sample_df_stats.items(), feature_stats.items()
-        ):
-            assert sample_feat == reference_feat, "The features do not match"
+        for feature_name in feature_stats:
+            sample_hist = np.asarray(sample_df_stats[feature_name])
+            reference_hist = np.asarray(feature_stats[feature_name])
             self.context.logger.info(
-                "Computing metrics for feature", feature_name=sample_feat
+                "Computing metrics for feature", feature_name=feature_name
             )
-            sample_arr = np.asarray(sample_hist)
-            reference_arr = np.asarray(reference_hist)
-            for metric in self.metrics:
-                metric_name = metric.NAME
-                self.context.logger.debug(
-                    "Computing data drift metric",
-                    metric_name=metric_name,
-                    feature_name=sample_feat,
-                )
-                metrics_per_feature[metric].append(
-                    metric(distrib_t=sample_arr, distrib_u=reference_arr).compute()
-                )
+            metrics_per_feature.loc[feature_name] = {  # pyright: ignore[reportCallIssue,reportArgumentType]
+                metric.NAME: metric(
+                    distrib_t=sample_hist, distrib_u=reference_hist
+                ).compute()
+                for metric in self.metrics
+            }
         self.context.logger.info("Finished computing the metrics")
 
         return metrics_per_feature
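The accumulator change above swaps the dict-of-lists for a DataFrame indexed by feature name with one column per metric, which makes the later averaging a single pandas call. A shape-only sketch with invented values (the real column names come from each metric class's NAME attribute):

    import pandas as pd

    metrics_per_feature = pd.DataFrame(columns=["hellinger", "kld", "tvd"])
    # .loc assignment with a dict aligns the values to the columns, as in
    # _compute_metrics_per_feature above:
    metrics_per_feature.loc["f0"] = {"hellinger": 0.2, "kld": 0.9, "tvd": 0.15}
    metrics_per_feature.loc["f1"] = {"hellinger": 0.1, "kld": 0.4, "tvd": 0.05}

    # Downstream (_get_results), averaging over features becomes one line:
    print(metrics_per_feature.astype(float).mean().to_dict())
    # roughly {'hellinger': 0.15, 'kld': 0.65, 'tvd': 0.1}
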
@@ -147,37 +144,37 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     def _add_general_drift_result(
         self, results: list[ModelMonitoringApplicationResult], value: float
     ) -> None:
+        """Add the general drift result to the results list and log it"""
+        status = self._value_classifier.value_to_status(value)
         results.append(
             ModelMonitoringApplicationResult(
                 name="general_drift",
                 value=value,
                 kind=self.METRIC_KIND,
-                status=self._value_classifier.value_to_status(value),
+                status=status,
             )
         )
 
     def _get_results(
-        self, metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]]
+        self, metrics_per_feature: DataFrame
     ) -> list[ModelMonitoringApplicationResult]:
         """Average the metrics over the features and add the status"""
         results: list[ModelMonitoringApplicationResult] = []
-        hellinger_tvd_values: list[float] = []
-        for metric_class, metric_values in metrics_per_feature.items():
-            self.context.logger.debug(
-                "Averaging metric over the features", metric_name=metric_class.NAME
-            )
-            value = np.mean(metric_values)
-            if metric_class == KullbackLeiblerDivergence:
+
+        self.context.logger.debug("Averaging metrics over the features")
+        metrics_mean = metrics_per_feature.mean().to_dict()
+
+        self.context.logger.debug("Creating the results")
+        for name, value in metrics_mean.items():
+            if name == KullbackLeiblerDivergence.NAME:
                 # This metric is not bounded from above [0, inf).
                 # No status is currently reported for KL divergence
                 status = ResultStatusApp.irrelevant
             else:
                 status = self._value_classifier.value_to_status(value)
-            if metric_class in self._REQUIRED_METRICS:
-                hellinger_tvd_values.append(value)
             results.append(
                 ModelMonitoringApplicationResult(
-                    name=f"{metric_class.NAME}_mean",
+                    name=f"{name}_mean",
                     value=value,
                     kind=self.METRIC_KIND,
                     status=status,
@@ -185,16 +182,102 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
             )
 
         self._add_general_drift_result(
-            results=results, value=np.mean(hellinger_tvd_values)
+            results=results,
+            value=np.mean(
+                [
+                    metrics_mean[HellingerDistance.NAME],
+                    metrics_mean[TotalVarianceDistance.NAME],
+                ]
+            ),
         )
 
+        self.context.logger.info("Finished with the results")
         return results
 
+    @staticmethod
+    def _remove_timestamp_feature(
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
+        """
+        Drop the 'timestamp' feature if it exists, as it is irrelevant
+        in the plotly artifact
+        """
+        sample_set_statistics = mlrun.common.model_monitoring.helpers.FeatureStats(
+            sample_set_statistics.copy()
+        )
+        if EventFieldType.TIMESTAMP in sample_set_statistics:
+            del sample_set_statistics[EventFieldType.TIMESTAMP]
+        return sample_set_statistics
+
+    def _log_json_artifact(self, drift_per_feature_values: Series) -> None:
+        """Log the drift values as a JSON artifact"""
+        self.context.logger.debug("Logging drift value per feature JSON artifact")
+        self.context.log_artifact(
+            mlrun.artifacts.Artifact(
+                body=drift_per_feature_values.to_json(),
+                format="json",
+                key="features_drift_results",
+            )
+        )
+        self.context.logger.debug("Logged JSON artifact successfully")
+
+    def _log_plotly_table_artifact(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        drift_per_feature_values: Series,
+    ) -> None:
+        """Log the Plotly drift table artifact"""
+        self.context.logger.debug(
+            "Feature stats",
+            sample_set_statistics=sample_set_statistics,
+            inputs_statistics=inputs_statistics,
+        )
+
+        self.context.logger.debug("Computing drift results per feature")
+        drift_results = {
+            cast(str, key): (self._value_classifier.value_to_status(value), value)
+            for key, value in drift_per_feature_values.items()
+        }
+        self.context.logger.debug("Logging plotly artifact")
+        self.context.log_artifact(
+            mm_drift_table.FeaturesDriftTablePlot().produce(
+                sample_set_statistics=sample_set_statistics,
+                inputs_statistics=inputs_statistics,
+                metrics=metrics_per_feature.T.to_dict(),
+                drift_results=drift_results,
+            )
+        )
+        self.context.logger.debug("Logged plotly artifact successfully")
+
+    def _log_drift_artifacts(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        log_json_artifact: bool = True,
+    ) -> None:
+        """Log JSON and Plotly drift data per feature artifacts"""
+        drift_per_feature_values = metrics_per_feature[
+            [HellingerDistance.NAME, TotalVarianceDistance.NAME]
+        ].mean(axis=1)
+
+        if log_json_artifact:
+            self._log_json_artifact(drift_per_feature_values)
+
+        self._log_plotly_table_artifact(
+            sample_set_statistics=self._remove_timestamp_feature(sample_set_statistics),
+            inputs_statistics=inputs_statistics,
+            metrics_per_feature=metrics_per_feature,
+            drift_per_feature_values=drift_per_feature_values,
+        )
+
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats: DataFrame,
-        feature_stats: DataFrame,
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: DataFrame,
         start_infer_time: Timestamp,
         end_infer_time: Timestamp,
@@ -210,7 +293,14 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         """
         self.context.logger.debug("Starting to run the application")
         metrics_per_feature = self._compute_metrics_per_feature(
-            sample_df_stats=sample_df_stats, feature_stats=feature_stats
+            sample_df_stats=self.dict_to_histogram(sample_df_stats),
+            feature_stats=self.dict_to_histogram(feature_stats),
+        )
+        self.context.logger.debug("Saving artifacts")
+        self._log_drift_artifacts(
+            inputs_statistics=feature_stats,
+            sample_set_statistics=sample_df_stats,
+            metrics_per_feature=metrics_per_feature,
         )
         self.context.logger.debug("Computing average per metric")
         results = self._get_results(metrics_per_feature)
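The general_drift value is now derived directly from the per-metric means: the average of the Hellinger mean and the total-variance-distance mean, with KL divergence deliberately left out because it is unbounded above. A worked toy example (numbers invented):

    import numpy as np

    metrics_mean = {"hellinger": 0.15, "tvd": 0.10, "kld": 0.65}
    # KL divergence gets ResultStatusApp.irrelevant and is excluded here:
    general_drift = np.mean([metrics_mean["hellinger"], metrics_mean["tvd"]])
    print(general_drift)  # 0.125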

mlrun/model_monitoring/batch.py

@@ -33,6 +33,7 @@ import mlrun.common.schemas.model_monitoring
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
 import mlrun.utils.v3io_clients
+from mlrun.model_monitoring.helpers import calculate_inputs_statistics
 from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
@@ -353,48 +354,6 @@ class VirtualDrift:
         return drift_status
 
 
-def calculate_inputs_statistics(
-    sample_set_statistics: dict, inputs: pd.DataFrame
-) -> dict:
-    """
-    Calculate the inputs data statistics for drift monitoring purpose.
-
-    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
-                                  histograms of each feature will be used to recalculate the histograms of the inputs.
-    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
-                                  sample set.
-
-    :returns: The calculated statistics of the inputs data.
-    """
-
-    # Use `DFDataInfer` to calculate the statistics over the inputs:
-    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
-    )
-
-    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics.keys():
-        if feature in sample_set_statistics:
-            counts, bins = np.histogram(
-                inputs[feature].to_numpy(),
-                bins=sample_set_statistics[feature]["hist"][1],
-            )
-            inputs_statistics[feature]["hist"] = [
-                counts.tolist(),
-                bins.tolist(),
-            ]
-        elif "hist" in inputs_statistics[feature]:
-            # Comply with the other common features' histogram length
-            mlrun.common.model_monitoring.helpers.pad_hist(
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
-
-    return inputs_statistics
-
-
 class BatchProcessor:
     """
     The main object to handle the batch processing job. This object is used to get the required configurations and

mlrun/model_monitoring/controller.py

@@ -31,10 +31,10 @@ from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
 from mlrun.errors import err_to_str
-from mlrun.model_monitoring.batch import calculate_inputs_statistics
 from mlrun.model_monitoring.helpers import (
     _BatchDict,
     batch_dict2timedelta,
+    calculate_inputs_statistics,
     get_monitoring_parquet_path,
     get_stream_path,
 )
@@ -445,13 +445,6 @@ class MonitoringApplicationController:
         m_fs = fstore.get_feature_set(
             endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
         )
-        labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
-        if labels:
-            if isinstance(labels, str):
-                labels = json.loads(labels)
-            for label in labels:
-                if label not in list(m_fs.spec.features.keys()):
-                    m_fs.add_feature(fstore.Feature(name=label, value_type="float"))
 
         for application in applications_names:
             batch_window = batch_window_generator.get_batch_window(

mlrun/model_monitoring/features_drift_table.py

@@ -21,9 +21,34 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 
 import mlrun.common.schemas.model_monitoring
+from mlrun.artifacts import PlotlyArtifact
 
 # A type for representing a drift result, a tuple of the status and the drift mean:
-DriftResultType = tuple[mlrun.common.schemas.model_monitoring.DriftStatus, float]
+DriftResultType = tuple[
+    mlrun.common.schemas.model_monitoring.constants.ResultStatusApp, float
+]
+
+
+class _PlotlyTableArtifact(PlotlyArtifact):
+    """A custom class for plotly table artifacts"""
+
+    @staticmethod
+    def _disable_table_dragging(figure_html: str) -> str:
+        """
+        Disable the table columns dragging by adding the following
+        JavaScript code
+        """
+        start, end = figure_html.rsplit(";", 1)
+        middle = (
+            ';for (const element of document.getElementsByClassName("table")) '
+            '{element.style.pointerEvents = "none";}'
+        )
+        figure_html = start + middle + end
+        return figure_html
+
+    def get_body(self) -> str:
+        """Get the adjusted HTML representation of the figure"""
+        return self._disable_table_dragging(super().get_body())
 
 
 class FeaturesDriftTablePlot:
@@ -62,9 +87,9 @@ class FeaturesDriftTablePlot:
 
     # Status configurations:
     _STATUS_COLORS = {
-        mlrun.common.schemas.model_monitoring.DriftStatus.NO_DRIFT: "rgb(0,176,80)",  # Green
-        mlrun.common.schemas.model_monitoring.DriftStatus.POSSIBLE_DRIFT: "rgb(255,192,0)",  # Orange
-        mlrun.common.schemas.model_monitoring.DriftStatus.DRIFT_DETECTED: "rgb(208,0,106)",  # Magenta
+        mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.no_detection: "rgb(0,176,80)",  # Green
+        mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.potential_detection: "rgb(255,192,0)",  # Orange
+        mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.detected: "rgb(208,0,106)",  # Magenta
     }
 
     # Font configurations:
@@ -97,7 +122,7 @@ class FeaturesDriftTablePlot:
         inputs_statistics: dict,
         metrics: dict[str, Union[dict, float]],
         drift_results: dict[str, DriftResultType],
-    ) -> str:
+    ) -> _PlotlyTableArtifact:
         """
         Produce the html code of the table plot with the given information and the stored configurations in the class.
 
@@ -106,9 +131,8 @@ class FeaturesDriftTablePlot:
         :param metrics:       The drift detection metrics calculated on the sample set and inputs.
         :param drift_results: The drift results per feature according to the rules of the monitor.
 
-        :return: The full path to the html file of the plot.
+        :return: The drift table as a plotly artifact.
         """
-        # Plot the drift table:
         figure = self._plot(
             features=list(inputs_statistics.keys()),
             sample_set_statistics=sample_set_statistics,
@@ -116,19 +140,7 @@ class FeaturesDriftTablePlot:
             metrics=metrics,
             drift_results=drift_results,
         )
-
-        # Get its HTML representation:
-        figure_html = figure.to_html()
-
-        # Turn off the table columns dragging by injecting the following JavaScript code:
-        start, end = figure_html.rsplit(";", 1)
-        middle = (
-            ';for (const element of document.getElementsByClassName("table")) '
-            '{element.style.pointerEvents = "none";}'
-        )
-        figure_html = start + middle + end
-
-        return figure_html
+        return _PlotlyTableArtifact(figure=figure, key="drift_table_plot")
 
     def _read_columns_names(self, statistics_dictionary: dict, drift_metrics: dict):
         """
@@ -366,10 +378,10 @@ class FeaturesDriftTablePlot:
         bins = np.array(bins)
         if bins[0] == -sys.float_info.max:
             bins[0] = bins[1] - (bins[2] - bins[1])
-            hovertext[0] = f"(-∞, {bins[1]})"
+            hovertext[0] = f"(-inf, {bins[1]})"
         if bins[-1] == sys.float_info.max:
             bins[-1] = bins[-2] + (bins[-2] - bins[-3])
-            hovertext[-1] = f"({bins[-2]}, ∞)"
+            hovertext[-1] = f"({bins[-2]}, inf)"
         # Center the bins (leave the first one):
         bins = 0.5 * (bins[:-1] + bins[1:])
         # Plot the histogram as a line with filled background below it:
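Context for this hunk: the first and last histogram bins are open-ended, stored with ±sys.float_info.max as their outer edges; the plot substitutes synthetic finite edges extrapolated from the neighboring bin widths, and the hover labels now spell out "inf" instead of the "∞" glyph. A standalone sketch of the edge substitution:

    import sys

    import numpy as np

    bins = np.array([-sys.float_info.max, 0.0, 1.0, 2.0, sys.float_info.max])
    if bins[0] == -sys.float_info.max:
        bins[0] = bins[1] - (bins[2] - bins[1])  # one bin-width below: -1.0
    if bins[-1] == sys.float_info.max:
        bins[-1] = bins[-2] + (bins[-2] - bins[-3])  # one bin-width above: 3.0
    print(bins)  # [-1.  0.  1.  2.  3.]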

mlrun/model_monitoring/helpers.py

@@ -15,6 +15,9 @@
 import datetime
 import typing
 
+import numpy as np
+import pandas as pd
+
 import mlrun
 import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas
@@ -36,10 +39,6 @@ class _BatchDict(typing.TypedDict):
     days: int
 
 
-class _MLRunNoRunsFoundError(Exception):
-    pass
-
-
 def get_stream_path(
     project: str = None,
     function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
@@ -212,3 +211,45 @@ def update_model_endpoint_last_request(
         endpoint_id=model_endpoint.metadata.uid,
         attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
     )
+
+
+def calculate_inputs_statistics(
+    sample_set_statistics: dict, inputs: pd.DataFrame
+) -> dict:
+    """
+    Calculate the inputs data statistics for drift monitoring purpose.
+
+    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
+                                  histograms of each feature will be used to recalculate the histograms of the inputs.
+    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
+                                  sample set.
+
+    :returns: The calculated statistics of the inputs data.
+    """
+
+    # Use `DFDataInfer` to calculate the statistics over the inputs:
+    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
+        df=inputs,
+        options=mlrun.data_types.infer.InferOptions.Histogram,
+    )
+
+    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
+    for feature in inputs_statistics.keys():
+        if feature in sample_set_statistics:
+            counts, bins = np.histogram(
+                inputs[feature].to_numpy(),
+                bins=sample_set_statistics[feature]["hist"][1],
+            )
+            inputs_statistics[feature]["hist"] = [
+                counts.tolist(),
+                bins.tolist(),
+            ]
+        elif "hist" in inputs_statistics[feature]:
+            # Comply with the other common features' histogram length
+            mlrun.common.model_monitoring.helpers.pad_hist(
+                mlrun.common.model_monitoring.helpers.Histogram(
+                    inputs_statistics[feature]["hist"]
+                )
+            )
+
+    return inputs_statistics
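Taken together with the batch.py hunk above, calculate_inputs_statistics moved from mlrun.model_monitoring.batch to mlrun.model_monitoring.helpers, and batch.py now re-imports it from the new home, so the old import path keeps working in rc6. Code that must also run on older builds can bridge both layouts with a guarded import, sketched here:

    try:
        # Location as of 1.7.0rc6:
        from mlrun.model_monitoring.helpers import calculate_inputs_statistics
    except ImportError:
        # Location in 1.7.0rc4 and earlier builds:
        from mlrun.model_monitoring.batch import calculate_inputs_statistics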

mlrun/model_monitoring/stream_processing.py

@@ -587,6 +587,8 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         for key in [
             EventFieldType.FEATURES,
             EventFieldType.NAMED_FEATURES,
+            EventFieldType.PREDICTION,
+            EventFieldType.NAMED_PREDICTIONS,
         ]:
             event.pop(key, None)