mlrun 1.7.0rc5__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (36)
  1. mlrun/artifacts/base.py +2 -1
  2. mlrun/artifacts/plots.py +9 -5
  3. mlrun/config.py +8 -2
  4. mlrun/datastore/sources.py +5 -4
  5. mlrun/db/factory.py +1 -1
  6. mlrun/launcher/__init__.py +1 -1
  7. mlrun/launcher/base.py +1 -1
  8. mlrun/launcher/client.py +1 -1
  9. mlrun/launcher/factory.py +1 -1
  10. mlrun/launcher/local.py +1 -1
  11. mlrun/launcher/remote.py +1 -1
  12. mlrun/model_monitoring/api.py +6 -12
  13. mlrun/model_monitoring/application.py +21 -21
  14. mlrun/model_monitoring/applications/histogram_data_drift.py +130 -40
  15. mlrun/model_monitoring/batch.py +1 -42
  16. mlrun/model_monitoring/controller.py +1 -8
  17. mlrun/model_monitoring/features_drift_table.py +34 -22
  18. mlrun/model_monitoring/helpers.py +45 -4
  19. mlrun/model_monitoring/stream_processing.py +2 -0
  20. mlrun/projects/project.py +170 -16
  21. mlrun/run.py +69 -73
  22. mlrun/runtimes/__init__.py +35 -0
  23. mlrun/runtimes/base.py +1 -1
  24. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  25. mlrun/runtimes/nuclio/application/application.py +283 -0
  26. mlrun/runtimes/nuclio/application/reverse_proxy.go +87 -0
  27. mlrun/runtimes/nuclio/function.py +50 -1
  28. mlrun/runtimes/pod.py +1 -1
  29. mlrun/serving/states.py +7 -19
  30. mlrun/utils/version/version.json +2 -2
  31. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/METADATA +1 -1
  32. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/RECORD +36 -33
  33. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/LICENSE +0 -0
  34. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/WHEEL +0 -0
  35. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/entry_points.txt +0 -0
  36. {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/top_level.txt +0 -0
mlrun/artifacts/base.py CHANGED
@@ -88,9 +88,10 @@ class ArtifactSpec(ModelObj):
         "db_key",
         "extra_data",
         "unpackaging_instructions",
+        "producer",
     ]
 
-    _extra_fields = ["annotations", "producer", "sources", "license", "encoding"]
+    _extra_fields = ["annotations", "sources", "license", "encoding"]
 
     _exclude_fields_from_uid_hash = [
         # if the artifact is first created, it will not have a db_key,
         # exclude it so further updates of the artifacts will have the same hash
mlrun/artifacts/plots.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
+import typing
 from io import BytesIO
 
 from deprecated import deprecated
@@ -21,6 +22,9 @@ import mlrun
 from ..utils import dict_to_json
 from .base import Artifact, LegacyArtifact
 
+if typing.TYPE_CHECKING:
+    from plotly.graph_objs import Figure
+
 
 class PlotArtifact(Artifact):
     kind = "plot"
@@ -207,10 +211,10 @@ class PlotlyArtifact(Artifact):
 
     def __init__(
         self,
-        figure=None,
-        key: str = None,
-        target_path: str = None,
-    ):
+        figure: typing.Optional["Figure"] = None,
+        key: typing.Optional[str] = None,
+        target_path: typing.Optional[str] = None,
+    ) -> None:
         """
         Initialize a Plotly artifact with the given figure.
 
@@ -247,7 +251,7 @@ class PlotlyArtifact(Artifact):
         self._figure = figure
         self.spec.format = "html"
 
-    def get_body(self):
+    def get_body(self) -> str:
         """
         Get the artifact's body - the Plotly figure's html code.
 
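Aside: the annotations above only tighten the existing signature. A minimal usage sketch, assuming plotly is installed and that PlotlyArtifact is imported from this module:

import plotly.graph_objs as go

from mlrun.artifacts.plots import PlotlyArtifact

# Any plotly Figure works; the artifact keeps it and renders HTML on demand.
figure = go.Figure(data=go.Scatter(y=[1.0, 3.0, 2.0]))

artifact = PlotlyArtifact(figure=figure, key="my_plot")
html_body = artifact.get_body()  # the figure's HTML code, per the docstring
assert isinstance(html_body, str)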
mlrun/config.py CHANGED
@@ -324,7 +324,13 @@ default_config = {
             # optional values (as per https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sql-mode-full):
             #
             # if set to "nil" or "none", nothing would be set
-            "modes": "STRICT_TRANS_TABLES",
+            "modes": (
+                "STRICT_TRANS_TABLES"
+                ",NO_ZERO_IN_DATE"
+                ",NO_ZERO_DATE"
+                ",ERROR_FOR_DIVISION_BY_ZERO"
+                ",NO_ENGINE_SUBSTITUTION"
+            ),
         },
     },
     "jobs": {
@@ -443,7 +449,7 @@ default_config = {
     # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
     # git+https://github.com/mlrun/mlrun@development. by default uses the version
     "mlrun_version_specifier": "",
-    "kaniko_image": "gcr.io/kaniko-project/executor:v1.8.0",  # kaniko builder image
+    "kaniko_image": "gcr.io/kaniko-project/executor:v1.21.1",  # kaniko builder image
     "kaniko_init_container_image": "alpine:3.18",
     # image for kaniko init container when docker registry is ECR
     "kaniko_aws_cli_image": "amazon/aws-cli:2.7.10",
mlrun/datastore/sources.py CHANGED
@@ -204,11 +204,11 @@ class CSVSource(BaseSourceDriver):
         )
 
     def get_spark_options(self):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
-                "path": url,
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
@@ -357,7 +357,7 @@ class ParquetSource(BaseSourceDriver):
         )
 
     def get_spark_options(self):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -794,7 +794,8 @@ class OnlineSource(BaseSourceDriver):
         explicit_ack = (
             is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
         )
-        src_class = storey.AsyncEmitSource(
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
mlrun/db/factory.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/launcher/__init__.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/launcher/base.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/launcher/client.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/launcher/factory.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/launcher/local.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/launcher/remote.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
mlrun/model_monitoring/api.py CHANGED
@@ -704,8 +704,8 @@ def perform_drift_analysis(
         drift_detected_threshold=drift_threshold,
     )
 
-    # Drift table plot
-    html_plot = FeaturesDriftTablePlot().produce(
+    # Drift table artifact
+    plotly_artifact = FeaturesDriftTablePlot().produce(
         sample_set_statistics=sample_set_statistics,
         inputs_statistics=inputs_statistics,
         metrics=metrics,
@@ -732,7 +732,7 @@ def perform_drift_analysis(
     # Log the different artifacts
     _log_drift_artifacts(
         context=context,
-        html_plot=html_plot,
+        plotly_artifact=plotly_artifact,
         metrics_per_feature=metrics_per_feature,
         drift_status=drift_status,
         drift_metric=drift_metric,
@@ -742,7 +742,7 @@ def perform_drift_analysis(
 
 def _log_drift_artifacts(
     context: mlrun.MLClientCtx,
-    html_plot: str,
+    plotly_artifact: mlrun.artifacts.Artifact,
     metrics_per_feature: dict[str, float],
     drift_status: bool,
     drift_metric: float,
@@ -755,20 +755,14 @@ def _log_drift_artifacts(
     3 - Results of the total drift analysis
 
     :param context:             MLRun context. Will log the artifacts.
-    :param html_plot:           Body of the html file of the plot.
+    :param plotly_artifact:     The plotly artifact.
     :param metrics_per_feature: Dictionary in which the key is a feature name and the value is the drift numerical
                                 result.
    :param drift_status:        Boolean value that represents the final drift analysis result.
    :param drift_metric:        The final drift numerical result.
    :param artifacts_tag:       Tag to use for all the artifacts resulted from the function.
-
    """
-    context.log_artifact(
-        mlrun.artifacts.Artifact(
-            body=html_plot.encode("utf-8"), format="html", key="drift_table_plot"
-        ),
-        tag=artifacts_tag,
-    )
+    context.log_artifact(plotly_artifact, tag=artifacts_tag)
     context.log_artifact(
         mlrun.artifacts.Artifact(
             body=json.dumps(metrics_per_feature),
mlrun/model_monitoring/application.py CHANGED
@@ -16,13 +16,13 @@ import dataclasses
 import json
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
 
 import numpy as np
 import pandas as pd
 
 import mlrun.common.helpers
-import mlrun.common.schemas.model_monitoring
+import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.utils.v3io_clients
 from mlrun.datastore import get_stream_pusher
@@ -84,8 +84,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
     class MyApp(ApplicationBase):
         def do_tracking(
             self,
-            sample_df_stats: pd.DataFrame,
-            feature_stats: pd.DataFrame,
+            sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+            feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
             start_infer_time: pd.Timestamp,
             end_infer_time: pd.Timestamp,
             schedule_time: pd.Timestamp,
@@ -93,7 +93,7 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
             endpoint_id: str,
             output_stream_uri: str,
         ) -> ModelMonitoringApplicationResult:
-            self.context.log_artifact(TableArtifact("sample_df_stats", df=sample_df_stats))
+            self.context.log_artifact(TableArtifact("sample_df_stats", df=self.dict_to_histogram(sample_df_stats)))
             return ModelMonitoringApplicationResult(
                 name="data_drift_test",
                 value=0.5,
@@ -126,14 +126,16 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return results, event
 
     def _lazy_init(self, app_name: str):
-        self.context = self._create_context_for_logging(app_name=app_name)
+        self.context = cast(
+            mlrun.MLClientCtx, self._create_context_for_logging(app_name=app_name)
+        )
 
     @abstractmethod
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats: pd.DataFrame,
-        feature_stats: pd.DataFrame,
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: pd.DataFrame,
         start_infer_time: pd.Timestamp,
         end_infer_time: pd.Timestamp,
@@ -147,8 +149,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         Implement this method with your custom monitoring logic.
 
         :param application_name: (str) the app name
-        :param sample_df_stats:  (pd.DataFrame) The new sample distribution DataFrame.
-        :param feature_stats:    (pd.DataFrame) The train sample distribution DataFrame.
+        :param sample_df_stats:  (FeatureStats) The new sample distribution dictionary.
+        :param feature_stats:    (FeatureStats) The train sample distribution dictionary.
         :param sample_df:        (pd.DataFrame) The new sample DataFrame.
         :param start_infer_time: (pd.Timestamp) Start time of the monitoring schedule.
         :param end_infer_time:   (pd.Timestamp) End time of the monitoring schedule.
@@ -167,8 +169,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         event: dict[str, Any],
     ) -> tuple[
         str,
-        pd.DataFrame,
-        pd.DataFrame,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
         pd.DataFrame,
         pd.Timestamp,
         pd.Timestamp,
@@ -184,8 +186,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
 
         :return: A tuple of:
                  [0] = (str) application name
-                 [1] = (pd.DataFrame) current input statistics
-                 [2] = (pd.DataFrame) train statistics
+                 [1] = (dict) current input statistics
+                 [2] = (dict) train statistics
                  [3] = (pd.DataFrame) current input data
                  [4] = (pd.Timestamp) start time of the monitoring schedule
                  [5] = (pd.Timestamp) end time of the monitoring schedule
@@ -197,12 +199,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         end_time = pd.Timestamp(event[mm_constant.ApplicationEvent.END_INFER_TIME])
         return (
             event[mm_constant.ApplicationEvent.APPLICATION_NAME],
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS])
-            ),
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS])
-            ),
+            json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS]),
+            json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS]),
             ParquetTarget(
                 path=event[mm_constant.ApplicationEvent.SAMPLE_PARQUET_PATH]
             ).as_df(start_time=start_time, end_time=end_time, time_column="timestamp"),
@@ -223,7 +221,9 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return context
 
     @staticmethod
-    def _dict_to_histogram(histogram_dict: dict[str, dict[str, Any]]) -> pd.DataFrame:
+    def dict_to_histogram(
+        histogram_dict: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> pd.DataFrame:
         """
         Convert histogram dictionary to pandas DataFrame with feature histograms as columns
 
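Aside: with _dict_to_histogram renamed to the public dict_to_histogram, custom applications can call it directly, as the updated docstring example above does. A standalone sketch; the exact FeatureStats payload shape here is an assumption based on the "hist" entries of [counts, bin_edges] seen elsewhere in this diff:

import pandas as pd

from mlrun.model_monitoring.application import ModelMonitoringApplicationBase

# Hypothetical feature statistics; each feature carries a "hist" entry.
feature_stats = {
    "feature_a": {"hist": [[2, 5, 3], [0.0, 1.0, 2.0, 3.0]]},
    "feature_b": {"hist": [[1, 4, 5], [0.0, 0.5, 1.0, 1.5]]},
}

histograms: pd.DataFrame = ModelMonitoringApplicationBase.dict_to_histogram(
    feature_stats
)
# Expected: one column per feature holding that feature's histogram.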
mlrun/model_monitoring/applications/histogram_data_drift.py CHANGED
@@ -13,13 +13,17 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-from typing import Final, Optional, Protocol
+from typing import Final, Optional, Protocol, cast
 
 import numpy as np
-from pandas import DataFrame, Timestamp
+from pandas import DataFrame, Series, Timestamp
 
+import mlrun.artifacts
+import mlrun.common.model_monitoring.helpers
+import mlrun.model_monitoring.features_drift_table as mm_drift_table
 from mlrun.common.schemas.model_monitoring.constants import (
     MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME,
+    EventFieldType,
     ResultKindApp,
     ResultStatusApp,
 )
@@ -27,7 +31,7 @@ from mlrun.model_monitoring.application import (
     ModelMonitoringApplicationBase,
     ModelMonitoringApplicationResult,
 )
-from mlrun.model_monitoring.batch import (
+from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
     KullbackLeiblerDivergence,
@@ -115,31 +119,24 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
 
     def _compute_metrics_per_feature(
         self, sample_df_stats: DataFrame, feature_stats: DataFrame
-    ) -> dict[type[HistogramDistanceMetric], list[float]]:
+    ) -> DataFrame:
         """Compute the metrics for the different features and labels"""
-        metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]] = {
-            metric_class: [] for metric_class in self.metrics
-        }
+        metrics_per_feature = DataFrame(
+            columns=[metric_class.NAME for metric_class in self.metrics]
+        )
 
-        for (sample_feat, sample_hist), (reference_feat, reference_hist) in zip(
-            sample_df_stats.items(), feature_stats.items()
-        ):
-            assert sample_feat == reference_feat, "The features do not match"
+        for feature_name in feature_stats:
+            sample_hist = np.asarray(sample_df_stats[feature_name])
+            reference_hist = np.asarray(feature_stats[feature_name])
             self.context.logger.info(
-                "Computing metrics for feature", feature_name=sample_feat
+                "Computing metrics for feature", feature_name=feature_name
             )
-            sample_arr = np.asarray(sample_hist)
-            reference_arr = np.asarray(reference_hist)
-            for metric in self.metrics:
-                metric_name = metric.NAME
-                self.context.logger.debug(
-                    "Computing data drift metric",
-                    metric_name=metric_name,
-                    feature_name=sample_feat,
-                )
-                metrics_per_feature[metric].append(
-                    metric(distrib_t=sample_arr, distrib_u=reference_arr).compute()
-                )
+            metrics_per_feature.loc[feature_name] = {  # pyright: ignore[reportCallIssue,reportArgumentType]
+                metric.NAME: metric(
+                    distrib_t=sample_hist, distrib_u=reference_hist
+                ).compute()
+                for metric in self.metrics
+            }
         self.context.logger.info("Finished computing the metrics")
 
         return metrics_per_feature
@@ -147,37 +144,37 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     def _add_general_drift_result(
         self, results: list[ModelMonitoringApplicationResult], value: float
     ) -> None:
+        """Add the general drift result to the results list and log it"""
+        status = self._value_classifier.value_to_status(value)
         results.append(
             ModelMonitoringApplicationResult(
                 name="general_drift",
                 value=value,
                 kind=self.METRIC_KIND,
-                status=self._value_classifier.value_to_status(value),
+                status=status,
             )
         )
 
     def _get_results(
-        self, metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]]
+        self, metrics_per_feature: DataFrame
     ) -> list[ModelMonitoringApplicationResult]:
         """Average the metrics over the features and add the status"""
         results: list[ModelMonitoringApplicationResult] = []
-        hellinger_tvd_values: list[float] = []
-        for metric_class, metric_values in metrics_per_feature.items():
-            self.context.logger.debug(
-                "Averaging metric over the features", metric_name=metric_class.NAME
-            )
-            value = np.mean(metric_values)
-            if metric_class == KullbackLeiblerDivergence:
+
+        self.context.logger.debug("Averaging metrics over the features")
+        metrics_mean = metrics_per_feature.mean().to_dict()
+
+        self.context.logger.debug("Creating the results")
+        for name, value in metrics_mean.items():
+            if name == KullbackLeiblerDivergence.NAME:
                 # This metric is not bounded from above [0, inf).
                 # No status is currently reported for KL divergence
                 status = ResultStatusApp.irrelevant
             else:
                 status = self._value_classifier.value_to_status(value)
-            if metric_class in self._REQUIRED_METRICS:
-                hellinger_tvd_values.append(value)
             results.append(
                 ModelMonitoringApplicationResult(
-                    name=f"{metric_class.NAME}_mean",
+                    name=f"{name}_mean",
                    value=value,
                    kind=self.METRIC_KIND,
                    status=status,
@@ -185,16 +182,102 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
             )
 
         self._add_general_drift_result(
-            results=results, value=np.mean(hellinger_tvd_values)
+            results=results,
+            value=np.mean(
+                [
+                    metrics_mean[HellingerDistance.NAME],
+                    metrics_mean[TotalVarianceDistance.NAME],
+                ]
+            ),
         )
 
+        self.context.logger.info("Finished with the results")
         return results
 
+    @staticmethod
+    def _remove_timestamp_feature(
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
+        """
+        Drop the 'timestamp' feature if it exists, as it is irrelevant
+        in the plotly artifact
+        """
+        sample_set_statistics = mlrun.common.model_monitoring.helpers.FeatureStats(
+            sample_set_statistics.copy()
+        )
+        if EventFieldType.TIMESTAMP in sample_set_statistics:
+            del sample_set_statistics[EventFieldType.TIMESTAMP]
+        return sample_set_statistics
+
+    def _log_json_artifact(self, drift_per_feature_values: Series) -> None:
+        """Log the drift values as a JSON artifact"""
+        self.context.logger.debug("Logging drift value per feature JSON artifact")
+        self.context.log_artifact(
+            mlrun.artifacts.Artifact(
+                body=drift_per_feature_values.to_json(),
+                format="json",
+                key="features_drift_results",
+            )
+        )
+        self.context.logger.debug("Logged JSON artifact successfully")
+
+    def _log_plotly_table_artifact(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        drift_per_feature_values: Series,
+    ) -> None:
+        """Log the Plotly drift table artifact"""
+        self.context.logger.debug(
+            "Feature stats",
+            sample_set_statistics=sample_set_statistics,
+            inputs_statistics=inputs_statistics,
+        )
+
+        self.context.logger.debug("Computing drift results per feature")
+        drift_results = {
+            cast(str, key): (self._value_classifier.value_to_status(value), value)
+            for key, value in drift_per_feature_values.items()
+        }
+        self.context.logger.debug("Logging plotly artifact")
+        self.context.log_artifact(
+            mm_drift_table.FeaturesDriftTablePlot().produce(
+                sample_set_statistics=sample_set_statistics,
+                inputs_statistics=inputs_statistics,
+                metrics=metrics_per_feature.T.to_dict(),
+                drift_results=drift_results,
+            )
+        )
+        self.context.logger.debug("Logged plotly artifact successfully")
+
+    def _log_drift_artifacts(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        log_json_artifact: bool = True,
+    ) -> None:
+        """Log JSON and Plotly drift data per feature artifacts"""
+        drift_per_feature_values = metrics_per_feature[
+            [HellingerDistance.NAME, TotalVarianceDistance.NAME]
+        ].mean(axis=1)
+
+        if log_json_artifact:
+            self._log_json_artifact(drift_per_feature_values)
+
+        self._log_plotly_table_artifact(
+            sample_set_statistics=self._remove_timestamp_feature(sample_set_statistics),
+            inputs_statistics=inputs_statistics,
+            metrics_per_feature=metrics_per_feature,
+            drift_per_feature_values=drift_per_feature_values,
+        )
+
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats: DataFrame,
-        feature_stats: DataFrame,
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: DataFrame,
         start_infer_time: Timestamp,
         end_infer_time: Timestamp,
@@ -210,7 +293,14 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         """
         self.context.logger.debug("Starting to run the application")
         metrics_per_feature = self._compute_metrics_per_feature(
-            sample_df_stats=sample_df_stats, feature_stats=feature_stats
+            sample_df_stats=self.dict_to_histogram(sample_df_stats),
+            feature_stats=self.dict_to_histogram(feature_stats),
+        )
+        self.context.logger.debug("Saving artifacts")
+        self._log_drift_artifacts(
+            inputs_statistics=feature_stats,
+            sample_set_statistics=sample_df_stats,
+            metrics_per_feature=metrics_per_feature,
         )
         self.context.logger.debug("Computing average per metric")
         results = self._get_results(metrics_per_feature)
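Aside: the refactor replaces the per-metric lists with a features-by-metrics DataFrame, so the averaging collapses to plain pandas calls. A self-contained sketch of the same pattern, with made-up metric names and values:

import pandas as pd

# Rows are features, columns are metrics, mirroring what
# _compute_metrics_per_feature now returns.
metrics_per_feature = pd.DataFrame(
    {"hellinger": [0.10, 0.30], "tvd": [0.20, 0.40], "kld": [0.05, 0.10]},
    index=["feature_a", "feature_b"],
)

# Per-metric means over all features, as in _get_results:
metrics_mean = metrics_per_feature.mean().to_dict()  # roughly {'hellinger': 0.2, 'tvd': 0.3, 'kld': 0.075}

# Per-feature drift value: row-wise mean of the two bounded metrics,
# as in _log_drift_artifacts:
drift_per_feature = metrics_per_feature[["hellinger", "tvd"]].mean(axis=1)
print(drift_per_feature.mean())  # overall drift value, approximately 0.25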
mlrun/model_monitoring/batch.py CHANGED
@@ -33,6 +33,7 @@ import mlrun.common.schemas.model_monitoring
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
 import mlrun.utils.v3io_clients
+from mlrun.model_monitoring.helpers import calculate_inputs_statistics
 from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
@@ -353,48 +354,6 @@ class VirtualDrift:
         return drift_status
 
 
-def calculate_inputs_statistics(
-    sample_set_statistics: dict, inputs: pd.DataFrame
-) -> dict:
-    """
-    Calculate the inputs data statistics for drift monitoring purpose.
-
-    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
-                                  histograms of each feature will be used to recalculate the histograms of the inputs.
-    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
-                                  sample set.
-
-    :returns: The calculated statistics of the inputs data.
-    """
-
-    # Use `DFDataInfer` to calculate the statistics over the inputs:
-    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
-    )
-
-    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics.keys():
-        if feature in sample_set_statistics:
-            counts, bins = np.histogram(
-                inputs[feature].to_numpy(),
-                bins=sample_set_statistics[feature]["hist"][1],
-            )
-            inputs_statistics[feature]["hist"] = [
-                counts.tolist(),
-                bins.tolist(),
-            ]
-        elif "hist" in inputs_statistics[feature]:
-            # Comply with the other common features' histogram length
-            mlrun.common.model_monitoring.helpers.pad_hist(
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
-
-    return inputs_statistics
-
-
 class BatchProcessor:
     """
     The main object to handle the batch processing job. This object is used to get the required configurations and
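Aside: calculate_inputs_statistics now lives in mlrun.model_monitoring.helpers (see the controller diff below), so downstream code only needs an import change. A hedged sketch, assuming the relocated helper keeps the signature shown in the removed code above:

# Before (1.7.0rc5):
# from mlrun.model_monitoring.batch import calculate_inputs_statistics
# After (1.7.0rc6):
from mlrun.model_monitoring.helpers import calculate_inputs_statistics

import pandas as pd

# Hypothetical reference statistics; "hist" holds [counts, bin_edges] as in
# the removed implementation.
sample_set_statistics = {"f1": {"hist": [[3, 4, 3], [0.0, 1.0, 2.0, 3.0]]}}
inputs = pd.DataFrame({"f1": [0.2, 1.4, 2.7, 2.9]})

# Recomputes the inputs' histograms over the reference bin edges.
inputs_statistics = calculate_inputs_statistics(
    sample_set_statistics=sample_set_statistics, inputs=inputs
)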
mlrun/model_monitoring/controller.py CHANGED
@@ -31,10 +31,10 @@ from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_his
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
 from mlrun.errors import err_to_str
-from mlrun.model_monitoring.batch import calculate_inputs_statistics
 from mlrun.model_monitoring.helpers import (
     _BatchDict,
     batch_dict2timedelta,
+    calculate_inputs_statistics,
     get_monitoring_parquet_path,
     get_stream_path,
 )
@@ -445,13 +445,6 @@ class MonitoringApplicationController:
         m_fs = fstore.get_feature_set(
             endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
         )
-        labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
-        if labels:
-            if isinstance(labels, str):
-                labels = json.loads(labels)
-            for label in labels:
-                if label not in list(m_fs.spec.features.keys()):
-                    m_fs.add_feature(fstore.Feature(name=label, value_type="float"))
 
         for application in applications_names:
             batch_window = batch_window_generator.get_batch_window(