mlrun 1.8.0rc45__py3-none-any.whl → 1.8.0rc47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

@@ -12,15 +12,17 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import collections
  import concurrent.futures
  import datetime
  import json
  import os
  import traceback
+ from collections import OrderedDict
  from collections.abc import Iterator
  from contextlib import AbstractContextManager
  from types import TracebackType
- from typing import Any, NamedTuple, Optional, cast
+ from typing import Any, NamedTuple, Optional, Union, cast

  import nuclio_sdk

@@ -28,7 +30,9 @@ import mlrun
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
  import mlrun.feature_store as fstore
  import mlrun.model_monitoring
+ import mlrun.model_monitoring.db._schedules as schedules
  import mlrun.model_monitoring.helpers
+ import mlrun.platforms.iguazio
  from mlrun.common.schemas import EndpointType
  from mlrun.common.schemas.model_monitoring.constants import (
      ControllerEvent,
@@ -36,7 +40,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
      ControllerEventKind,
  )
  from mlrun.errors import err_to_str
- from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
  from mlrun.model_monitoring.helpers import batch_dict2timedelta
  from mlrun.utils import datetime_now, logger

@@ -53,7 +56,7 @@ class _BatchWindow:
      def __init__(
          self,
          *,
-         schedules_file: ModelMonitoringSchedulesFile,
+         schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
          application: str,
          timedelta_seconds: int,
          last_updated: int,
@@ -153,7 +156,7 @@ class _BatchWindowGenerator(AbstractContextManager):
          self._project = project
          self._endpoint_id = endpoint_id
          self._timedelta = window_length
-         self._schedules_file = ModelMonitoringSchedulesFile(
+         self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
              project=project, endpoint_id=endpoint_id
          )

@@ -243,7 +246,7 @@ class MonitoringApplicationController:
      Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
      """

-     _MAX_OPEN_WINDOWS_ALLOWED = 5
+     _MAX_FEATURE_SET_PER_WORKER = 1000

      def __init__(self) -> None:
          """Initialize Monitoring Application Controller"""
@@ -259,6 +262,61 @@ class MonitoringApplicationController:
              mlrun.mlconf.artifact_path
          )
          self.storage_options = store.get_storage_options()
+         self._controller_stream: Optional[
+             Union[
+                 mlrun.platforms.iguazio.OutputStream,
+                 mlrun.platforms.iguazio.KafkaOutputStream,
+             ]
+         ] = None
+         self._model_monitoring_stream: Optional[
+             Union[
+                 mlrun.platforms.iguazio.OutputStream,
+                 mlrun.platforms.iguazio.KafkaOutputStream,
+             ]
+         ] = None
+         self.applications_streams: dict[
+             str,
+             Union[
+                 mlrun.platforms.iguazio.OutputStream,
+                 mlrun.platforms.iguazio.KafkaOutputStream,
+             ],
+         ] = {}
+         self.feature_sets: OrderedDict[str, mlrun.feature_store.FeatureSet] = (
+             collections.OrderedDict()
+         )
+         self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+             project=self.project
+         )
+
+     @property
+     def controller_stream(
+         self,
+     ) -> Union[
+         mlrun.platforms.iguazio.OutputStream,
+         mlrun.platforms.iguazio.KafkaOutputStream,
+     ]:
+         if self._controller_stream is None:
+             self._controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
+                 project=self.project,
+                 function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+                 v3io_access_key=self.v3io_access_key,
+             )
+         return self._controller_stream
+
+     @property
+     def model_monitoring_stream(
+         self,
+     ) -> Union[
+         mlrun.platforms.iguazio.OutputStream,
+         mlrun.platforms.iguazio.KafkaOutputStream,
+     ]:
+         if self._model_monitoring_stream is None:
+             self._model_monitoring_stream = mlrun.model_monitoring.helpers.get_output_stream(
+                 project=self.project,
+                 function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+                 v3io_access_key=self.v3io_access_key,
+             )
+         return self._model_monitoring_stream

      @staticmethod
      def _get_model_monitoring_access_key() -> Optional[str]:
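
The hunk above replaces per-call stream creation with stream objects that start out as None in __init__ and are materialized lazily through the new controller_stream / model_monitoring_stream properties. A minimal, self-contained sketch of that lazy-property caching pattern, using a generic factory callable in place of mlrun.model_monitoring.helpers.get_output_stream (the class and factory below are illustrative stand-ins, not mlrun APIs):

    from typing import Callable, Optional


    class LazyStreamHolder:
        """Build the underlying output stream on first use, then reuse it."""

        def __init__(self, factory: Callable[[], object]) -> None:
            self._factory = factory          # e.g. a closure over get_output_stream(...)
            self._stream: Optional[object] = None

        @property
        def stream(self) -> object:
            if self._stream is None:         # only the first access pays the connection cost
                self._stream = self._factory()
            return self._stream


    holder = LazyStreamHolder(factory=object)
    assert holder.stream is holder.stream    # later pushes reuse the same stream object
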
@@ -273,6 +331,7 @@ class MonitoringApplicationController:
          endpoint: mlrun.common.schemas.ModelEndpoint,
          application_names: set,
          base_period_minutes: int,
+         schedules_file: schedules.ModelMonitoringSchedulesFileChief,
      ) -> bool:
          """
          checks if there is a need to monitor the given endpoint, we should monitor endpoint if it stands in the
@@ -281,11 +340,23 @@ class MonitoringApplicationController:
          2. first request exists
          3. last request exists
          4. endpoint_type is not ROUTER
-         if the four above conditions apply we require one of the three conditions to monitor:
+         if the four above conditions apply we require one of the two condition monitor:
          1. never monitored the one of the endpoint applications meaning min_last_analyzed is None
-         2. last request has a higher timestamp than the min_last_analyzed timestamp
-         3. We didn't analyze one of the application for over than _MAX_OPEN_WINDOWS_ALLOWED windows
+         2. min_last_analyzed stands in the condition for sending NOP event and this the first time regular event
+            is sent with the combination of current last_request & current last_analyzed per endpoint.
          """
+         last_timestamp_sent = schedules_file.get_endpoint_last_request(
+             endpoint.metadata.uid
+         )
+         last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(
+             endpoint.metadata.uid
+         )
+         logger.debug(
+             "Chief should monitor endpoint check",
+             last_timestamp_sent=last_timestamp_sent,
+             last_analyzed_sent=last_analyzed_sent,
+             uid=endpoint.metadata.uid,
+         )
          if (
              # Is the model endpoint monitored?
              endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
@@ -300,26 +371,43 @@ class MonitoringApplicationController:
                  project=endpoint.metadata.project,
                  endpoint_id=endpoint.metadata.uid,
              ) as batch_window_generator:
-                 base_period_seconds = base_period_minutes * _SECONDS_IN_MINUTE
-                 if application_names != batch_window_generator.get_application_list():
+                 current_time = mlrun.utils.datetime_now()
+                 current_min_last_analyzed = (
+                     batch_window_generator.get_min_last_analyzed()
+                 )
+                 if (
+                     # Different application names, or last analyzed never updated while there are application to monitor
+                     application_names
+                     and (
+                         application_names
+                         != batch_window_generator.get_application_list()
+                         or not current_min_last_analyzed
+                     )
+                 ):
                      return True
                  elif (
-                     not batch_window_generator.get_min_last_analyzed()
-                     or batch_window_generator.get_min_last_analyzed()
-                     <= int(endpoint.status.last_request.timestamp())
-                     or mlrun.utils.datetime_now().timestamp()
-                     - batch_window_generator.get_min_last_analyzed()
-                     >= self._MAX_OPEN_WINDOWS_ALLOWED * base_period_seconds
+                     # Does nop event will be sent to close the relevant window
+                     self._should_send_nop_event(
+                         base_period_minutes, current_min_last_analyzed, current_time
+                     )
+                     and (
+                         int(endpoint.status.last_request.timestamp())
+                         != last_timestamp_sent
+                         or current_min_last_analyzed != last_analyzed_sent
+                     )
                  ):
+                     # Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
+                     schedules_file.update_endpoint_timestamps(
+                         endpoint_uid=endpoint.metadata.uid,
+                         last_request=int(endpoint.status.last_request.timestamp()),
+                         last_analyzed=current_min_last_analyzed,
+                     )
                      return True
                  else:
                      logger.info(
                          "All the possible intervals were already analyzed, didn't push regular event",
                          endpoint_id=endpoint.metadata.uid,
-                         last_analyzed=datetime.datetime.fromtimestamp(
-                             batch_window_generator.get_min_last_analyzed(),
-                             tz=datetime.timezone.utc,
-                         ),
+                         last_analyzed=current_min_last_analyzed,
                          last_request=endpoint.status.last_request,
                      )
          else:
@@ -334,6 +422,21 @@ class MonitoringApplicationController:
              )
          return False

+     @staticmethod
+     def _should_send_nop_event(
+         base_period_minutes: int,
+         min_last_analyzed: int,
+         current_time: datetime.datetime,
+     ):
+         if min_last_analyzed:
+             return (
+                 current_time.timestamp() - min_last_analyzed
+                 >= datetime.timedelta(minutes=base_period_minutes).total_seconds()
+                 + mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
+             )
+         else:
+             return True
+
      def run(self, event: nuclio_sdk.Event) -> None:
          """
          Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
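
For orientation, here is a standalone sketch of the timing rule the new _should_send_nop_event encodes: once base_period_minutes plus the parquet batching timeout have elapsed since min_last_analyzed, a NOP event is due to close the open window; with no min_last_analyzed at all it is always due. The 600-second timeout below is an assumed placeholder, not the package default:

    import datetime

    PARQUET_BATCHING_TIMEOUT_SECS = 600  # assumption for this sketch only


    def should_send_nop_event(
        base_period_minutes: int,
        min_last_analyzed: int,
        current_time: datetime.datetime,
    ) -> bool:
        if not min_last_analyzed:
            # Nothing analyzed yet - the window should be closed right away.
            return True
        threshold = (
            datetime.timedelta(minutes=base_period_minutes).total_seconds()
            + PARQUET_BATCHING_TIMEOUT_SECS
        )
        return current_time.timestamp() - min_last_analyzed >= threshold


    # Example: a 10 minute base period + 600 s timeout => NOP due after 1200 s of silence.
    now = datetime.datetime.now(tz=datetime.timezone.utc)
    print(should_send_nop_event(10, int(now.timestamp()) - 1300, now))  # True
    print(should_send_nop_event(10, int(now.timestamp()) - 100, now))   # False
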
@@ -377,9 +480,9 @@ class MonitoringApplicationController:
          ]

          not_batch_endpoint = (
-             event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
+             event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
          )
-         m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
+
          logger.info(
              "Starting analyzing for", timestamp=event[ControllerEvent.TIMESTAMP]
          )
@@ -404,13 +507,39 @@ class MonitoringApplicationController:
                      first_request=first_request,
                      last_request=last_stream_timestamp,
                  ):
-                     df = m_fs.to_dataframe(
-                         start_time=start_infer_time,
-                         end_time=end_infer_time,
-                         time_column=mm_constants.EventFieldType.TIMESTAMP,
-                         storage_options=self.storage_options,
-                     )
-                     if len(df) == 0:
+                     data_in_window = False
+                     if not_batch_endpoint:
+                         # Serving endpoint - get the relevant window data from the TSDB
+                         prediction_metric = self.tsdb_connector.read_predictions(
+                             start=start_infer_time,
+                             end=end_infer_time,
+                             endpoint_id=endpoint_id,
+                         )
+                         if prediction_metric.data:
+                             data_in_window = True
+                     else:
+                         if endpoint_id not in self.feature_sets:
+                             self.feature_sets[endpoint_id] = fstore.get_feature_set(
+                                 event[ControllerEvent.FEATURE_SET_URI]
+                             )
+                             self.feature_sets.move_to_end(endpoint_id, last=False)
+                             if (
+                                 len(self.feature_sets)
+                                 > self._MAX_FEATURE_SET_PER_WORKER
+                             ):
+                                 self.feature_sets.popitem(last=True)
+                         m_fs = self.feature_sets.get(endpoint_id)
+
+                         # Batch endpoint - get the relevant window data from the parquet target
+                         df = m_fs.to_dataframe(
+                             start_time=start_infer_time,
+                             end_time=end_infer_time,
+                             time_column=mm_constants.EventFieldType.TIMESTAMP,
+                             storage_options=self.storage_options,
+                         )
+                         if len(df) > 0:
+                             data_in_window = True
+                     if not data_in_window:
                          logger.info(
                              "No data found for the given interval",
                              start=start_infer_time,
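
The feature-set lookup above is now cached per endpoint in an OrderedDict capped at _MAX_FEATURE_SET_PER_WORKER (1000) entries: a newly loaded entry is moved to the front, and the entry at the back is evicted once the cap is exceeded. A small, generic sketch of that bounded-cache idea, with a tiny cap and a stand-in loader instead of fstore.get_feature_set:

    import collections
    from typing import Callable, OrderedDict

    MAX_ENTRIES = 3  # the diff uses 1000 per worker; kept tiny here for demonstration


    def get_cached(
        cache: OrderedDict[str, object], key: str, load: Callable[[str], object]
    ) -> object:
        if key not in cache:
            cache[key] = load(key)               # fetch once, e.g. a feature set by URI
            cache.move_to_end(key, last=False)   # newest entry goes to the front
            if len(cache) > MAX_ENTRIES:
                cache.popitem(last=True)         # drop the entry at the back
        return cache[key]


    cache: OrderedDict[str, object] = collections.OrderedDict()
    for endpoint_id in ["ep-1", "ep-2", "ep-3", "ep-4"]:
        get_cached(cache, endpoint_id, lambda k: f"feature-set-for-{k}")
    print(list(cache))  # ['ep-4', 'ep-3', 'ep-2'] - 'ep-1' was evicted
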
@@ -441,9 +570,11 @@ class MonitoringApplicationController:
              ]
              current_time = mlrun.utils.datetime_now()
              if (
-                 current_time.timestamp()
-                 - batch_window_generator.get_min_last_analyzed()
-                 >= datetime.timedelta(minutes=base_period).total_seconds()
+                 self._should_send_nop_event(
+                     base_period,
+                     batch_window_generator.get_min_last_analyzed(),
+                     current_time,
+                 )
                  and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
              ):
                  event = {
@@ -481,8 +612,8 @@ class MonitoringApplicationController:
              endpoint_id=event[ControllerEvent.ENDPOINT_ID],
          )

-     @staticmethod
      def _push_to_applications(
+         self,
          start_infer_time: datetime.datetime,
          end_infer_time: datetime.datetime,
          endpoint_id: str,
@@ -516,12 +647,15 @@ class MonitoringApplicationController:
          }
          for app_name in applications_names:
              data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
-
-             app_stream = mlrun.model_monitoring.helpers.get_output_stream(
-                 project=project,
-                 function_name=app_name,
-                 v3io_access_key=model_monitoring_access_key,
-             )
+             if app_name not in self.applications_streams:
+                 self.applications_streams[app_name] = (
+                     mlrun.model_monitoring.helpers.get_output_stream(
+                         project=project,
+                         function_name=app_name,
+                         v3io_access_key=model_monitoring_access_key,
+                     )
+                 )
+             app_stream = self.applications_streams.get(app_name)

              logger.info(
                  "Pushing data to application stream",
@@ -534,7 +668,6 @@ class MonitoringApplicationController:
      def push_regular_event_to_controller_stream(self) -> None:
          """
          pushes a regular event to the controller stream.
-         :param event: the nuclio trigger event
          """
          logger.info("Starting monitoring controller chief")
          applications_names = []
@@ -581,29 +714,32 @@ class MonitoringApplicationController:
          with concurrent.futures.ThreadPoolExecutor(
              max_workers=min(len(endpoints), 10)
          ) as pool:
-             futures = {
-                 pool.submit(
-                     self.endpoint_to_regular_event,
-                     endpoint,
-                     policy,
-                     set(applications_names),
-                     self.v3io_access_key,
-                 ): endpoint
-                 for endpoint in endpoints
-             }
-             for future in concurrent.futures.as_completed(futures):
-                 if future.exception():
-                     exception = future.exception()
-                     error = (
-                         f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
-                         f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
-                     )
-                     error += "".join(
-                         traceback.format_exception(
-                             None, exception, exception.__traceback__
+             with schedules.ModelMonitoringSchedulesFileChief(
+                 self.project
+             ) as schedule_file:
+                 futures = {
+                     pool.submit(
+                         self.endpoint_to_regular_event,
+                         endpoint,
+                         policy,
+                         set(applications_names),
+                         schedule_file,
+                     ): endpoint
+                     for endpoint in endpoints
+                 }
+                 for future in concurrent.futures.as_completed(futures):
+                     if future.exception():
+                         exception = future.exception()
+                         error = (
+                             f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
+                             f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
                          )
-                     )
-                     logger.error(error)
+                         error += "".join(
+                             traceback.format_exception(
+                                 None, exception, exception.__traceback__
+                             )
+                         )
+                         logger.error(error)
          logger.info("Finishing monitoring controller chief")

      def endpoint_to_regular_event(
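
The reworked chief loop above opens one ModelMonitoringSchedulesFileChief for the whole batch, hands it to every worker task, and logs (rather than raises) per-endpoint failures. A self-contained sketch of that submit-and-collect pattern; open_shared_schedules and handle_endpoint are illustrative stand-ins, not mlrun APIs:

    import concurrent.futures
    import traceback
    from contextlib import contextmanager


    @contextmanager
    def open_shared_schedules(project: str):
        state: dict[str, str] = {}   # stands in for the shared schedules file
        try:
            yield state
        finally:
            pass                     # a real file would be flushed/closed here


    def handle_endpoint(endpoint: str, schedules: dict[str, str]) -> None:
        if endpoint == "ep-bad":
            raise RuntimeError("simulated failure")
        schedules[endpoint] = "event pushed"


    endpoints = ["ep-1", "ep-2", "ep-bad"]
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(endpoints), 10)) as pool:
        with open_shared_schedules("my-project") as schedules:
            futures = {pool.submit(handle_endpoint, ep, schedules): ep for ep in endpoints}
            for future in concurrent.futures.as_completed(futures):
                exc = future.exception()
                if exc:
                    # One failing endpoint is reported without aborting the others.
                    print(f"Failed endpoint {futures[future]}:\n" + "".join(
                        traceback.format_exception(None, exc, exc.__traceback__)))
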
@@ -611,15 +747,16 @@ class MonitoringApplicationController:
          endpoint: mlrun.common.schemas.ModelEndpoint,
          policy: dict,
          applications_names: set,
-         v3io_access_key: str,
+         schedule_file: schedules.ModelMonitoringSchedulesFileChief,
      ) -> None:
          if self._should_monitor_endpoint(
              endpoint,
              set(applications_names),
              policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
+             schedule_file,
          ):
-             logger.info(
-                 "Regular event is being pushed to controller stream for model endpoint",
+             logger.debug(
+                 "Endpoint data is being prepared for regular event",
                  endpoint_id=endpoint.metadata.uid,
                  endpoint_name=endpoint.metadata.name,
                  timestamp=endpoint.status.last_request.isoformat(
@@ -635,12 +772,11 @@ class MonitoringApplicationController:
              policy[ControllerEventEndpointPolicy.ENDPOINT_UPDATED] = (
                  endpoint.metadata.updated.isoformat()
              )
-             MonitoringApplicationController.push_to_controller_stream(
+             self.push_to_controller_stream(
                  kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
                  project=endpoint.metadata.project,
                  endpoint_id=endpoint.metadata.uid,
                  endpoint_name=endpoint.metadata.name,
-                 stream_access_key=v3io_access_key,
                  timestamp=endpoint.status.last_request.isoformat(
                      sep=" ", timespec="microseconds"
                  ),
@@ -652,13 +788,12 @@ class MonitoringApplicationController:
                  endpoint_policy=policy,
              )

-     @staticmethod
      def push_to_controller_stream(
+         self,
          kind: str,
          project: str,
          endpoint_id: str,
          endpoint_name: str,
-         stream_access_key: str,
          timestamp: str,
          first_request: str,
          endpoint_type: int,
@@ -676,7 +811,6 @@ class MonitoringApplicationController:
          :param endpoint_name: the endpoint name string
          :param endpoint_type: Enum of the endpoint type
          :param feature_set_uri: the feature set uri string
-         :param stream_access_key: access key to apply the model monitoring process.
          """
          event = {
              ControllerEvent.KIND.value: kind,
@@ -689,18 +823,13 @@ class MonitoringApplicationController:
              ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
              ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
          }
-         controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
-             project=project,
-             function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
-             v3io_access_key=stream_access_key,
-         )
          logger.info(
              "Pushing data to controller stream",
              event=event,
              endpoint_id=endpoint_id,
-             controller_stream_type=str(type(controller_stream)),
+             controller_stream_type=str(type(self.controller_stream)),
          )
-         controller_stream.push([event], partition_key=endpoint_id)
+         self.controller_stream.push([event], partition_key=endpoint_id)

      def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
          """
@@ -708,18 +837,13 @@ class MonitoringApplicationController:
          :param event: event dictionary to push to stream
          :param endpoint_id: endpoint id string
          """
-         mm_stream = mlrun.model_monitoring.helpers.get_output_stream(
-             project=event.get(ControllerEvent.PROJECT),
-             function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
-             v3io_access_key=self.v3io_access_key,
-         )
          logger.info(
              "Pushing data to main stream, NOP event is been generated",
              event=json.dumps(event),
              endpoint_id=endpoint_id,
-             mm_stream_type=str(type(mm_stream)),
+             mm_stream_type=str(type(self.model_monitoring_stream)),
          )
-         mm_stream.push([event], partition_key=endpoint_id)
+         self.model_monitoring_stream.push([event], partition_key=endpoint_id)


  def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None: