PyPI - mlrun - Versions diffs - 1.7.0rc5__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl - Mend

mlrun 1.7.0rc5__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. Click here for more details.

Files changed (36) hide show

mlrun/artifacts/base.py +2 -1
mlrun/artifacts/plots.py +9 -5
mlrun/config.py +8 -2
mlrun/datastore/sources.py +5 -4
mlrun/db/factory.py +1 -1
mlrun/launcher/__init__.py +1 -1
mlrun/launcher/base.py +1 -1
mlrun/launcher/client.py +1 -1
mlrun/launcher/factory.py +1 -1
mlrun/launcher/local.py +1 -1
mlrun/launcher/remote.py +1 -1
mlrun/model_monitoring/api.py +6 -12
mlrun/model_monitoring/application.py +21 -21
mlrun/model_monitoring/applications/histogram_data_drift.py +130 -40
mlrun/model_monitoring/batch.py +1 -42
mlrun/model_monitoring/controller.py +1 -8
mlrun/model_monitoring/features_drift_table.py +34 -22
mlrun/model_monitoring/helpers.py +45 -4
mlrun/model_monitoring/stream_processing.py +2 -0
mlrun/projects/project.py +170 -16
mlrun/run.py +69 -73
mlrun/runtimes/__init__.py +35 -0
mlrun/runtimes/base.py +1 -1
mlrun/runtimes/nuclio/application/__init__.py +15 -0
mlrun/runtimes/nuclio/application/application.py +283 -0
mlrun/runtimes/nuclio/application/reverse_proxy.go +87 -0
mlrun/runtimes/nuclio/function.py +50 -1
mlrun/runtimes/pod.py +1 -1
mlrun/serving/states.py +7 -19
mlrun/utils/version/version.json +2 -2
{mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/METADATA +1 -1
{mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/RECORD +36 -33
{mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/LICENSE +0 -0
{mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/WHEEL +0 -0
{mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/entry_points.txt +0 -0
{mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc6.dist-info}/top_level.txt +0 -0

mlrun/artifacts/base.py CHANGED Viewed

@@ -88,9 +88,10 @@ class ArtifactSpec(ModelObj):
         "db_key",
         "extra_data",
         "unpackaging_instructions",
+        "producer",
     ]
-    _extra_fields = ["annotations", "producer", "sources", "license", "encoding"]
+    _extra_fields = ["annotations", "sources", "license", "encoding"]
     _exclude_fields_from_uid_hash = [
         # if the artifact is first created, it will not have a db_key,
         # exclude it so further updates of the artifacts will have the same hash

mlrun/artifacts/plots.py CHANGED Viewed

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
+import typing
 from io import BytesIO
 from deprecated import deprecated
@@ -21,6 +22,9 @@ import mlrun
 from ..utils import dict_to_json
 from .base import Artifact, LegacyArtifact
+if typing.TYPE_CHECKING:
+    from plotly.graph_objs import Figure
 class PlotArtifact(Artifact):
     kind = "plot"
@@ -207,10 +211,10 @@ class PlotlyArtifact(Artifact):
     def __init__(
         self,
-        figure=None,
-        key: str = None,
-        target_path: str = None,
-    ):
+        figure: typing.Optional["Figure"] = None,
+        key: typing.Optional[str] = None,
+        target_path: typing.Optional[str] = None,
+    ) -> None:
         """
         Initialize a Plotly artifact with the given figure.
@@ -247,7 +251,7 @@ class PlotlyArtifact(Artifact):
         self._figure = figure
         self.spec.format = "html"
-    def get_body(self):
+    def get_body(self) -> str:
         """
         Get the artifact's body - the Plotly figure's html code.

mlrun/config.py CHANGED Viewed

@@ -324,7 +324,13 @@ default_config = {
                 # optional values (as per https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sql-mode-full):
                 #
                 # if set to "nil" or "none", nothing would be set
-                "modes": "STRICT_TRANS_TABLES",
+                "modes": (
+                    "STRICT_TRANS_TABLES"
+                    ",NO_ZERO_IN_DATE"
+                    ",NO_ZERO_DATE"
+                    ",ERROR_FOR_DIVISION_BY_ZERO"
+                    ",NO_ENGINE_SUBSTITUTION",
+                )
             },
         },
         "jobs": {
@@ -443,7 +449,7 @@ default_config = {
             # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
             # git+https://github.com/mlrun/mlrun@development. by default uses the version
             "mlrun_version_specifier": "",
-            "kaniko_image": "gcr.io/kaniko-project/executor:v1.8.0",  # kaniko builder image
+            "kaniko_image": "gcr.io/kaniko-project/executor:v1.21.1",  # kaniko builder image
             "kaniko_init_container_image": "alpine:3.18",
             # image for kaniko init container when docker registry is ECR
             "kaniko_aws_cli_image": "amazon/aws-cli:2.7.10",

mlrun/datastore/sources.py CHANGED Viewed

@@ -204,11 +204,11 @@ class CSVSource(BaseSourceDriver):
         )
     def get_spark_options(self):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
-                "path": url,
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
@@ -357,7 +357,7 @@ class ParquetSource(BaseSourceDriver):
         )
     def get_spark_options(self):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -794,7 +794,8 @@ class OnlineSource(BaseSourceDriver):
         explicit_ack = (
             is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
         )
-        src_class = storey.AsyncEmitSource(
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,

mlrun/db/factory.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/launcher/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/launcher/base.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/launcher/client.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/launcher/factory.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/launcher/local.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/launcher/remote.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2023 MLRun Authors
+# Copyright 2023 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

mlrun/model_monitoring/api.py CHANGED Viewed

@@ -704,8 +704,8 @@ def perform_drift_analysis(
         drift_detected_threshold=drift_threshold,
     )
-    # Drift table plot
-    html_plot = FeaturesDriftTablePlot().produce(
+    # Drift table artifact
+    plotly_artifact = FeaturesDriftTablePlot().produce(
         sample_set_statistics=sample_set_statistics,
         inputs_statistics=inputs_statistics,
         metrics=metrics,
@@ -732,7 +732,7 @@ def perform_drift_analysis(
     # Log the different artifacts
     _log_drift_artifacts(
         context=context,
-        html_plot=html_plot,
+        plotly_artifact=plotly_artifact,
         metrics_per_feature=metrics_per_feature,
         drift_status=drift_status,
         drift_metric=drift_metric,
@@ -742,7 +742,7 @@ def perform_drift_analysis(
 def _log_drift_artifacts(
     context: mlrun.MLClientCtx,
-    html_plot: str,
+    plotly_artifact: mlrun.artifacts.Artifact,
     metrics_per_feature: dict[str, float],
     drift_status: bool,
     drift_metric: float,
@@ -755,20 +755,14 @@ def _log_drift_artifacts(
     3 - Results of the total drift analysis
     :param context:             MLRun context. Will log the artifacts.
-    :param html_plot:           Body of the html file of the plot.
+    :param plotly_artifact:     The plotly artifact.
     :param metrics_per_feature: Dictionary in which the key is a feature name and the value is the drift numerical
                                 result.
     :param drift_status:        Boolean value that represents the final drift analysis result.
     :param drift_metric:        The final drift numerical result.
     :param artifacts_tag:       Tag to use for all the artifacts resulted from the function.
     """
-    context.log_artifact(
-        mlrun.artifacts.Artifact(
-            body=html_plot.encode("utf-8"), format="html", key="drift_table_plot"
-        ),
-        tag=artifacts_tag,
-    )
+    context.log_artifact(plotly_artifact, tag=artifacts_tag)
     context.log_artifact(
         mlrun.artifacts.Artifact(
             body=json.dumps(metrics_per_feature),

mlrun/model_monitoring/application.py CHANGED Viewed

@@ -16,13 +16,13 @@ import dataclasses
 import json
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
 import numpy as np
 import pandas as pd
 import mlrun.common.helpers
-import mlrun.common.schemas.model_monitoring
+import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.utils.v3io_clients
 from mlrun.datastore import get_stream_pusher
@@ -84,8 +84,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         class MyApp(ApplicationBase):
             def do_tracking(
                 self,
-                sample_df_stats: pd.DataFrame,
-                feature_stats: pd.DataFrame,
+                sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+                feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
                 start_infer_time: pd.Timestamp,
                 end_infer_time: pd.Timestamp,
                 schedule_time: pd.Timestamp,
@@ -93,7 +93,7 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
                 endpoint_id: str,
                 output_stream_uri: str,
             ) -> ModelMonitoringApplicationResult:
-                self.context.log_artifact(TableArtifact("sample_df_stats", df=sample_df_stats))
+                self.context.log_artifact(TableArtifact("sample_df_stats", df=self.dict_to_histogram(sample_df_stats)))
                 return ModelMonitoringApplicationResult(
                     name="data_drift_test",
                     value=0.5,
@@ -126,14 +126,16 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return results, event
     def _lazy_init(self, app_name: str):
-        self.context = self._create_context_for_logging(app_name=app_name)
+        self.context = cast(
+            mlrun.MLClientCtx, self._create_context_for_logging(app_name=app_name)
+        )
     @abstractmethod
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats: pd.DataFrame,
-        feature_stats: pd.DataFrame,
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: pd.DataFrame,
         start_infer_time: pd.Timestamp,
         end_infer_time: pd.Timestamp,
@@ -147,8 +149,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         Implement this method with your custom monitoring logic.
         :param application_name:         (str) the app name
-        :param sample_df_stats:         (pd.DataFrame) The new sample distribution DataFrame.
-        :param feature_stats:           (pd.DataFrame) The train sample distribution DataFrame.
+        :param sample_df_stats:         (FeatureStats) The new sample distribution dictionary.
+        :param feature_stats:           (FeatureStats) The train sample distribution dictionary.
         :param sample_df:               (pd.DataFrame) The new sample DataFrame.
         :param start_infer_time:        (pd.Timestamp) Start time of the monitoring schedule.
         :param end_infer_time:          (pd.Timestamp) End time of the monitoring schedule.
@@ -167,8 +169,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         event: dict[str, Any],
     ) -> tuple[
         str,
-        pd.DataFrame,
-        pd.DataFrame,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
+        mlrun.common.model_monitoring.helpers.FeatureStats,
         pd.DataFrame,
         pd.Timestamp,
         pd.Timestamp,
@@ -184,8 +186,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         :return: A tuple of:
                      [0] = (str) application name
-                     [1] = (pd.DataFrame) current input statistics
-                     [2] = (pd.DataFrame) train statistics
+                     [1] = (dict) current input statistics
+                     [2] = (dict) train statistics
                      [3] = (pd.DataFrame) current input data
                      [4] = (pd.Timestamp) start time of the monitoring schedule
                      [5] = (pd.Timestamp) end time of the monitoring schedule
@@ -197,12 +199,8 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         end_time = pd.Timestamp(event[mm_constant.ApplicationEvent.END_INFER_TIME])
         return (
             event[mm_constant.ApplicationEvent.APPLICATION_NAME],
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS])
-            ),
-            cls._dict_to_histogram(
-                json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS])
-            ),
+            json.loads(event[mm_constant.ApplicationEvent.CURRENT_STATS]),
+            json.loads(event[mm_constant.ApplicationEvent.FEATURE_STATS]),
             ParquetTarget(
                 path=event[mm_constant.ApplicationEvent.SAMPLE_PARQUET_PATH]
             ).as_df(start_time=start_time, end_time=end_time, time_column="timestamp"),
@@ -223,7 +221,9 @@ class ModelMonitoringApplicationBase(StepToDict, ABC):
         return context
     @staticmethod
-    def _dict_to_histogram(histogram_dict: dict[str, dict[str, Any]]) -> pd.DataFrame:
+    def dict_to_histogram(
+        histogram_dict: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> pd.DataFrame:
         """
         Convert histogram dictionary to pandas DataFrame with feature histograms as columns

mlrun/model_monitoring/applications/histogram_data_drift.py CHANGED Viewed

@@ -13,13 +13,17 @@
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Final, Optional, Protocol
+from typing import Final, Optional, Protocol, cast
 import numpy as np
-from pandas import DataFrame, Timestamp
+from pandas import DataFrame, Series, Timestamp
+import mlrun.artifacts
+import mlrun.common.model_monitoring.helpers
+import mlrun.model_monitoring.features_drift_table as mm_drift_table
 from mlrun.common.schemas.model_monitoring.constants import (
     MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME,
+    EventFieldType,
     ResultKindApp,
     ResultStatusApp,
 )
@@ -27,7 +31,7 @@ from mlrun.model_monitoring.application import (
     ModelMonitoringApplicationBase,
     ModelMonitoringApplicationResult,
 )
-from mlrun.model_monitoring.batch import (
+from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
     KullbackLeiblerDivergence,
@@ -115,31 +119,24 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     def _compute_metrics_per_feature(
         self, sample_df_stats: DataFrame, feature_stats: DataFrame
-    ) -> dict[type[HistogramDistanceMetric], list[float]]:
+    ) -> DataFrame:
         """Compute the metrics for the different features and labels"""
-        metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]] = {
-            metric_class: [] for metric_class in self.metrics
-        }
+        metrics_per_feature = DataFrame(
+            columns=[metric_class.NAME for metric_class in self.metrics]
+        )
-        for (sample_feat, sample_hist), (reference_feat, reference_hist) in zip(
-            sample_df_stats.items(), feature_stats.items()
-        ):
-            assert sample_feat == reference_feat, "The features do not match"
+        for feature_name in feature_stats:
+            sample_hist = np.asarray(sample_df_stats[feature_name])
+            reference_hist = np.asarray(feature_stats[feature_name])
             self.context.logger.info(
-                "Computing metrics for feature", feature_name=sample_feat
+                "Computing metrics for feature", feature_name=feature_name
             )
-            sample_arr = np.asarray(sample_hist)
-            reference_arr = np.asarray(reference_hist)
-            for metric in self.metrics:
-                metric_name = metric.NAME
-                self.context.logger.debug(
-                    "Computing data drift metric",
-                    metric_name=metric_name,
-                    feature_name=sample_feat,
-                )
-                metrics_per_feature[metric].append(
-                    metric(distrib_t=sample_arr, distrib_u=reference_arr).compute()
-                )
+            metrics_per_feature.loc[feature_name] = {  # pyright: ignore[reportCallIssue,reportArgumentType]
+                metric.NAME: metric(
+                    distrib_t=sample_hist, distrib_u=reference_hist
+                ).compute()
+                for metric in self.metrics
+            }
         self.context.logger.info("Finished computing the metrics")
         return metrics_per_feature
@@ -147,37 +144,37 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     def _add_general_drift_result(
         self, results: list[ModelMonitoringApplicationResult], value: float
     ) -> None:
+        """Add the general drift result to the results list and log it"""
+        status = self._value_classifier.value_to_status(value)
         results.append(
             ModelMonitoringApplicationResult(
                 name="general_drift",
                 value=value,
                 kind=self.METRIC_KIND,
-                status=self._value_classifier.value_to_status(value),
+                status=status,
             )
         )
     def _get_results(
-        self, metrics_per_feature: dict[type[HistogramDistanceMetric], list[float]]
+        self, metrics_per_feature: DataFrame
     ) -> list[ModelMonitoringApplicationResult]:
         """Average the metrics over the features and add the status"""
         results: list[ModelMonitoringApplicationResult] = []
-        hellinger_tvd_values: list[float] = []
-        for metric_class, metric_values in metrics_per_feature.items():
-            self.context.logger.debug(
-                "Averaging metric over the features", metric_name=metric_class.NAME
-            )
-            value = np.mean(metric_values)
-            if metric_class == KullbackLeiblerDivergence:
+        self.context.logger.debug("Averaging metrics over the features")
+        metrics_mean = metrics_per_feature.mean().to_dict()
+        self.context.logger.debug("Creating the results")
+        for name, value in metrics_mean.items():
+            if name == KullbackLeiblerDivergence.NAME:
                 # This metric is not bounded from above [0, inf).
                 # No status is currently reported for KL divergence
                 status = ResultStatusApp.irrelevant
             else:
                 status = self._value_classifier.value_to_status(value)
-            if metric_class in self._REQUIRED_METRICS:
-                hellinger_tvd_values.append(value)
             results.append(
                 ModelMonitoringApplicationResult(
-                    name=f"{metric_class.NAME}_mean",
+                    name=f"{name}_mean",
                     value=value,
                     kind=self.METRIC_KIND,
                     status=status,
@@ -185,16 +182,102 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
             )
         self._add_general_drift_result(
-            results=results, value=np.mean(hellinger_tvd_values)
+            results=results,
+            value=np.mean(
+                [
+                    metrics_mean[HellingerDistance.NAME],
+                    metrics_mean[TotalVarianceDistance.NAME],
+                ]
+            ),
         )
+        self.context.logger.info("Finished with the results")
         return results
+    @staticmethod
+    def _remove_timestamp_feature(
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+    ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
+        """
+        Drop the 'timestamp' feature if it exists, as it is irrelevant
+        in the plotly artifact
+        """
+        sample_set_statistics = mlrun.common.model_monitoring.helpers.FeatureStats(
+            sample_set_statistics.copy()
+        )
+        if EventFieldType.TIMESTAMP in sample_set_statistics:
+            del sample_set_statistics[EventFieldType.TIMESTAMP]
+        return sample_set_statistics
+    def _log_json_artifact(self, drift_per_feature_values: Series) -> None:
+        """Log the drift values as a JSON artifact"""
+        self.context.logger.debug("Logging drift value per feature JSON artifact")
+        self.context.log_artifact(
+            mlrun.artifacts.Artifact(
+                body=drift_per_feature_values.to_json(),
+                format="json",
+                key="features_drift_results",
+            )
+        )
+        self.context.logger.debug("Logged JSON artifact successfully")
+    def _log_plotly_table_artifact(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        drift_per_feature_values: Series,
+    ) -> None:
+        """Log the Plotly drift table artifact"""
+        self.context.logger.debug(
+            "Feature stats",
+            sample_set_statistics=sample_set_statistics,
+            inputs_statistics=inputs_statistics,
+        )
+        self.context.logger.debug("Computing drift results per feature")
+        drift_results = {
+            cast(str, key): (self._value_classifier.value_to_status(value), value)
+            for key, value in drift_per_feature_values.items()
+        }
+        self.context.logger.debug("Logging plotly artifact")
+        self.context.log_artifact(
+            mm_drift_table.FeaturesDriftTablePlot().produce(
+                sample_set_statistics=sample_set_statistics,
+                inputs_statistics=inputs_statistics,
+                metrics=metrics_per_feature.T.to_dict(),
+                drift_results=drift_results,
+            )
+        )
+        self.context.logger.debug("Logged plotly artifact successfully")
+    def _log_drift_artifacts(
+        self,
+        sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats,
+        metrics_per_feature: DataFrame,
+        log_json_artifact: bool = True,
+    ) -> None:
+        """Log JSON and Plotly drift data per feature artifacts"""
+        drift_per_feature_values = metrics_per_feature[
+            [HellingerDistance.NAME, TotalVarianceDistance.NAME]
+        ].mean(axis=1)
+        if log_json_artifact:
+            self._log_json_artifact(drift_per_feature_values)
+        self._log_plotly_table_artifact(
+            sample_set_statistics=self._remove_timestamp_feature(sample_set_statistics),
+            inputs_statistics=inputs_statistics,
+            metrics_per_feature=metrics_per_feature,
+            drift_per_feature_values=drift_per_feature_values,
+        )
     def do_tracking(
         self,
         application_name: str,
-        sample_df_stats: DataFrame,
-        feature_stats: DataFrame,
+        sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
+        feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats,
         sample_df: DataFrame,
         start_infer_time: Timestamp,
         end_infer_time: Timestamp,
@@ -210,7 +293,14 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         """
         self.context.logger.debug("Starting to run the application")
         metrics_per_feature = self._compute_metrics_per_feature(
-            sample_df_stats=sample_df_stats, feature_stats=feature_stats
+            sample_df_stats=self.dict_to_histogram(sample_df_stats),
+            feature_stats=self.dict_to_histogram(feature_stats),
+        )
+        self.context.logger.debug("Saving artifacts")
+        self._log_drift_artifacts(
+            inputs_statistics=feature_stats,
+            sample_set_statistics=sample_df_stats,
+            metrics_per_feature=metrics_per_feature,
         )
         self.context.logger.debug("Computing average per metric")
         results = self._get_results(metrics_per_feature)

mlrun/model_monitoring/batch.py CHANGED Viewed

@@ -33,6 +33,7 @@ import mlrun.common.schemas.model_monitoring
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
 import mlrun.utils.v3io_clients
+from mlrun.model_monitoring.helpers import calculate_inputs_statistics
 from mlrun.model_monitoring.metrics.histogram_distance import (
     HellingerDistance,
     HistogramDistanceMetric,
@@ -353,48 +354,6 @@ class VirtualDrift:
         return drift_status
-def calculate_inputs_statistics(
-    sample_set_statistics: dict, inputs: pd.DataFrame
-) -> dict:
-    """
-    Calculate the inputs data statistics for drift monitoring purpose.
-    :param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
-                                  histograms of each feature will be used to recalculate the histograms of the inputs.
-    :param inputs:                The inputs to calculate their statistics and later on - the drift with respect to the
-                                  sample set.
-    :returns: The calculated statistics of the inputs data.
-    """
-    # Use `DFDataInfer` to calculate the statistics over the inputs:
-    inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
-    )
-    # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics.keys():
-        if feature in sample_set_statistics:
-            counts, bins = np.histogram(
-                inputs[feature].to_numpy(),
-                bins=sample_set_statistics[feature]["hist"][1],
-            )
-            inputs_statistics[feature]["hist"] = [
-                counts.tolist(),
-                bins.tolist(),
-            ]
-        elif "hist" in inputs_statistics[feature]:
-            # Comply with the other common features' histogram length
-            mlrun.common.model_monitoring.helpers.pad_hist(
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
-    return inputs_statistics
 class BatchProcessor:
     """
     The main object to handle the batch processing job. This object is used to get the required configurations and

mlrun/model_monitoring/controller.py CHANGED Viewed

@@ -31,10 +31,10 @@ from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_his
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
 from mlrun.errors import err_to_str
-from mlrun.model_monitoring.batch import calculate_inputs_statistics
 from mlrun.model_monitoring.helpers import (
     _BatchDict,
     batch_dict2timedelta,
+    calculate_inputs_statistics,
     get_monitoring_parquet_path,
     get_stream_path,
 )
@@ -445,13 +445,6 @@ class MonitoringApplicationController:
             m_fs = fstore.get_feature_set(
                 endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
             )
-            labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
-            if labels:
-                if isinstance(labels, str):
-                    labels = json.loads(labels)
-                for label in labels:
-                    if label not in list(m_fs.spec.features.keys()):
-                        m_fs.add_feature(fstore.Feature(name=label, value_type="float"))
             for application in applications_names:
                 batch_window = batch_window_generator.get_batch_window(

mlrun 1.7.0rc5__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl

Potentially problematic release.

mlrun 1.7.0rc5__py3-none-any.whl → 1.7.0rc6__py3-none-any.whl