PyPI - mlrun - Versions diffs - 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl - Mend

mlrun: mlrun-1.7.2rc3-py3-none-any.whl → mlrun-1.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. Click here for more details.

Files changed (275)

mlrun/__init__.py +26 -22
mlrun/__main__.py +15 -16
mlrun/alerts/alert.py +150 -15
mlrun/api/schemas/__init__.py +1 -9
mlrun/artifacts/__init__.py +2 -3
mlrun/artifacts/base.py +62 -19
mlrun/artifacts/dataset.py +17 -17
mlrun/artifacts/document.py +454 -0
mlrun/artifacts/manager.py +28 -18
mlrun/artifacts/model.py +91 -59
mlrun/artifacts/plots.py +2 -2
mlrun/common/constants.py +8 -0
mlrun/common/formatters/__init__.py +1 -0
mlrun/common/formatters/artifact.py +1 -1
mlrun/common/formatters/feature_set.py +2 -0
mlrun/common/formatters/function.py +1 -0
mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
mlrun/common/formatters/pipeline.py +1 -2
mlrun/common/formatters/project.py +9 -0
mlrun/common/model_monitoring/__init__.py +0 -5
mlrun/common/model_monitoring/helpers.py +12 -62
mlrun/common/runtimes/constants.py +25 -4
mlrun/common/schemas/__init__.py +9 -5
mlrun/common/schemas/alert.py +114 -19
mlrun/common/schemas/api_gateway.py +3 -3
mlrun/common/schemas/artifact.py +22 -9
mlrun/common/schemas/auth.py +8 -4
mlrun/common/schemas/background_task.py +7 -7
mlrun/common/schemas/client_spec.py +4 -4
mlrun/common/schemas/clusterization_spec.py +2 -2
mlrun/common/schemas/common.py +53 -3
mlrun/common/schemas/constants.py +15 -0
mlrun/common/schemas/datastore_profile.py +1 -1
mlrun/common/schemas/feature_store.py +9 -9
mlrun/common/schemas/frontend_spec.py +4 -4
mlrun/common/schemas/function.py +10 -10
mlrun/common/schemas/hub.py +1 -1
mlrun/common/schemas/k8s.py +3 -3
mlrun/common/schemas/memory_reports.py +3 -3
mlrun/common/schemas/model_monitoring/__init__.py +4 -8
mlrun/common/schemas/model_monitoring/constants.py +127 -46
mlrun/common/schemas/model_monitoring/grafana.py +18 -12
mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
mlrun/common/schemas/notification.py +24 -3
mlrun/common/schemas/object.py +1 -1
mlrun/common/schemas/pagination.py +4 -4
mlrun/common/schemas/partition.py +142 -0
mlrun/common/schemas/pipeline.py +3 -3
mlrun/common/schemas/project.py +26 -18
mlrun/common/schemas/runs.py +3 -3
mlrun/common/schemas/runtime_resource.py +5 -5
mlrun/common/schemas/schedule.py +1 -1
mlrun/common/schemas/secret.py +1 -1
mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
mlrun/common/schemas/tag.py +3 -3
mlrun/common/schemas/workflow.py +6 -5
mlrun/common/types.py +1 -0
mlrun/config.py +157 -89
mlrun/data_types/__init__.py +5 -3
mlrun/data_types/infer.py +13 -3
mlrun/data_types/spark.py +2 -1
mlrun/datastore/__init__.py +59 -18
mlrun/datastore/alibaba_oss.py +4 -1
mlrun/datastore/azure_blob.py +4 -1
mlrun/datastore/base.py +19 -24
mlrun/datastore/datastore.py +10 -4
mlrun/datastore/datastore_profile.py +178 -45
mlrun/datastore/dbfs_store.py +4 -1
mlrun/datastore/filestore.py +4 -1
mlrun/datastore/google_cloud_storage.py +4 -1
mlrun/datastore/hdfs.py +4 -1
mlrun/datastore/inmem.py +4 -1
mlrun/datastore/redis.py +4 -1
mlrun/datastore/s3.py +14 -3
mlrun/datastore/sources.py +89 -92
mlrun/datastore/store_resources.py +7 -4
mlrun/datastore/storeytargets.py +51 -16
mlrun/datastore/targets.py +38 -31
mlrun/datastore/utils.py +87 -4
mlrun/datastore/v3io.py +4 -1
mlrun/datastore/vectorstore.py +291 -0
mlrun/datastore/wasbfs/fs.py +13 -12
mlrun/db/base.py +286 -100
mlrun/db/httpdb.py +1562 -490
mlrun/db/nopdb.py +250 -83
mlrun/errors.py +6 -2
mlrun/execution.py +194 -50
mlrun/feature_store/__init__.py +2 -10
mlrun/feature_store/api.py +20 -458
mlrun/feature_store/common.py +9 -9
mlrun/feature_store/feature_set.py +20 -18
mlrun/feature_store/feature_vector.py +105 -479
mlrun/feature_store/feature_vector_utils.py +466 -0
mlrun/feature_store/retrieval/base.py +15 -11
mlrun/feature_store/retrieval/job.py +2 -1
mlrun/feature_store/retrieval/storey_merger.py +1 -1
mlrun/feature_store/steps.py +3 -3
mlrun/features.py +30 -13
mlrun/frameworks/__init__.py +1 -2
mlrun/frameworks/_common/__init__.py +1 -2
mlrun/frameworks/_common/artifacts_library.py +2 -2
mlrun/frameworks/_common/mlrun_interface.py +10 -6
mlrun/frameworks/_common/model_handler.py +31 -31
mlrun/frameworks/_common/producer.py +3 -1
mlrun/frameworks/_dl_common/__init__.py +1 -2
mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
mlrun/frameworks/_ml_common/__init__.py +1 -2
mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
mlrun/frameworks/_ml_common/model_handler.py +21 -21
mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
mlrun/frameworks/auto_mlrun/__init__.py +1 -2
mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
mlrun/frameworks/huggingface/__init__.py +1 -2
mlrun/frameworks/huggingface/model_server.py +9 -9
mlrun/frameworks/lgbm/__init__.py +47 -44
mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
mlrun/frameworks/lgbm/model_handler.py +15 -11
mlrun/frameworks/lgbm/model_server.py +11 -7
mlrun/frameworks/lgbm/utils.py +2 -2
mlrun/frameworks/onnx/__init__.py +1 -2
mlrun/frameworks/onnx/dataset.py +3 -3
mlrun/frameworks/onnx/mlrun_interface.py +2 -2
mlrun/frameworks/onnx/model_handler.py +7 -5
mlrun/frameworks/onnx/model_server.py +8 -6
mlrun/frameworks/parallel_coordinates.py +11 -11
mlrun/frameworks/pytorch/__init__.py +22 -23
mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
mlrun/frameworks/pytorch/model_handler.py +21 -17
mlrun/frameworks/pytorch/model_server.py +13 -9
mlrun/frameworks/sklearn/__init__.py +19 -18
mlrun/frameworks/sklearn/estimator.py +2 -2
mlrun/frameworks/sklearn/metric.py +3 -3
mlrun/frameworks/sklearn/metrics_library.py +8 -6
mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
mlrun/frameworks/sklearn/model_handler.py +4 -3
mlrun/frameworks/tf_keras/__init__.py +11 -12
mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
mlrun/frameworks/tf_keras/model_handler.py +17 -13
mlrun/frameworks/tf_keras/model_server.py +12 -8
mlrun/frameworks/xgboost/__init__.py +19 -18
mlrun/frameworks/xgboost/model_handler.py +13 -9
mlrun/k8s_utils.py +2 -5
mlrun/launcher/base.py +3 -4
mlrun/launcher/client.py +2 -2
mlrun/launcher/local.py +6 -2
mlrun/launcher/remote.py +1 -1
mlrun/lists.py +8 -4
mlrun/model.py +132 -46
mlrun/model_monitoring/__init__.py +3 -5
mlrun/model_monitoring/api.py +113 -98
mlrun/model_monitoring/applications/__init__.py +0 -5
mlrun/model_monitoring/applications/_application_steps.py +81 -50
mlrun/model_monitoring/applications/base.py +467 -14
mlrun/model_monitoring/applications/context.py +212 -134
mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
mlrun/model_monitoring/applications/evidently/base.py +146 -0
mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
mlrun/model_monitoring/applications/results.py +67 -15
mlrun/model_monitoring/controller.py +701 -315
mlrun/model_monitoring/db/__init__.py +0 -2
mlrun/model_monitoring/db/_schedules.py +242 -0
mlrun/model_monitoring/db/_stats.py +189 -0
mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
mlrun/model_monitoring/db/tsdb/base.py +243 -49
mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
mlrun/model_monitoring/helpers.py +356 -114
mlrun/model_monitoring/stream_processing.py +190 -345
mlrun/model_monitoring/tracking_policy.py +11 -4
mlrun/model_monitoring/writer.py +49 -90
mlrun/package/__init__.py +3 -6
mlrun/package/context_handler.py +2 -2
mlrun/package/packager.py +12 -9
mlrun/package/packagers/__init__.py +0 -2
mlrun/package/packagers/default_packager.py +14 -11
mlrun/package/packagers/numpy_packagers.py +16 -7
mlrun/package/packagers/pandas_packagers.py +18 -18
mlrun/package/packagers/python_standard_library_packagers.py +25 -11
mlrun/package/packagers_manager.py +35 -32
mlrun/package/utils/__init__.py +0 -3
mlrun/package/utils/_pickler.py +6 -6
mlrun/platforms/__init__.py +47 -16
mlrun/platforms/iguazio.py +4 -1
mlrun/projects/operations.py +30 -30
mlrun/projects/pipelines.py +116 -47
mlrun/projects/project.py +1292 -329
mlrun/render.py +5 -9
mlrun/run.py +57 -14
mlrun/runtimes/__init__.py +1 -3
mlrun/runtimes/base.py +30 -22
mlrun/runtimes/daskjob.py +9 -9
mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
mlrun/runtimes/function_reference.py +5 -2
mlrun/runtimes/generators.py +3 -2
mlrun/runtimes/kubejob.py +6 -7
mlrun/runtimes/mounts.py +574 -0
mlrun/runtimes/mpijob/__init__.py +0 -2
mlrun/runtimes/mpijob/abstract.py +7 -6
mlrun/runtimes/nuclio/api_gateway.py +7 -7
mlrun/runtimes/nuclio/application/application.py +11 -13
mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
mlrun/runtimes/nuclio/function.py +127 -70
mlrun/runtimes/nuclio/serving.py +105 -37
mlrun/runtimes/pod.py +159 -54
mlrun/runtimes/remotesparkjob.py +3 -2
mlrun/runtimes/sparkjob/__init__.py +0 -2
mlrun/runtimes/sparkjob/spark3job.py +22 -12
mlrun/runtimes/utils.py +7 -6
mlrun/secrets.py +2 -2
mlrun/serving/__init__.py +8 -0
mlrun/serving/merger.py +7 -5
mlrun/serving/remote.py +35 -22
mlrun/serving/routers.py +186 -240
mlrun/serving/server.py +41 -10
mlrun/serving/states.py +432 -118
mlrun/serving/utils.py +13 -2
mlrun/serving/v1_serving.py +3 -2
mlrun/serving/v2_serving.py +161 -203
mlrun/track/__init__.py +1 -1
mlrun/track/tracker.py +2 -2
mlrun/track/trackers/mlflow_tracker.py +6 -5
mlrun/utils/async_http.py +35 -22
mlrun/utils/clones.py +7 -4
mlrun/utils/helpers.py +511 -58
mlrun/utils/logger.py +119 -13
mlrun/utils/notifications/notification/__init__.py +22 -19
mlrun/utils/notifications/notification/base.py +39 -15
mlrun/utils/notifications/notification/console.py +6 -6
mlrun/utils/notifications/notification/git.py +11 -11
mlrun/utils/notifications/notification/ipython.py +10 -9
mlrun/utils/notifications/notification/mail.py +176 -0
mlrun/utils/notifications/notification/slack.py +16 -8
mlrun/utils/notifications/notification/webhook.py +24 -8
mlrun/utils/notifications/notification_pusher.py +191 -200
mlrun/utils/regex.py +12 -2
mlrun/utils/version/version.json +2 -2
{mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/METADATA +81 -54
mlrun-1.8.0.dist-info/RECORD +351 -0
{mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
mlrun/model_monitoring/applications/evidently_base.py +0 -137
mlrun/model_monitoring/db/stores/__init__.py +0 -136
mlrun/model_monitoring/db/stores/base/store.py +0 -213
mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
mlrun/model_monitoring/model_endpoint.py +0 -118
mlrun-1.7.2rc3.dist-info/RECORD +0 -351
{mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
{mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
{mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0

mlrun/model_monitoring/controller.py CHANGED Viewed

@@ -11,31 +11,42 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import collections
 import concurrent.futures
 import datetime
 import json
 import os
-import re
+import traceback
+from collections import OrderedDict
 from collections.abc import Iterator
-from typing import NamedTuple, Optional, Union, cast
+from contextlib import AbstractContextManager
+from types import TracebackType
+from typing import Any, NamedTuple, Optional, Union, cast
-import nuclio
+import nuclio_sdk
+import pandas as pd
 import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
-import mlrun.data_types.infer
 import mlrun.feature_store as fstore
-import mlrun.model_monitoring.db.stores
-from mlrun.config import config as mlconf
-from mlrun.datastore import get_stream_pusher
-from mlrun.errors import err_to_str
-from mlrun.model_monitoring.helpers import (
-    _BatchDict,
-    batch_dict2timedelta,
-    get_stream_path,
+import mlrun.model_monitoring
+import mlrun.model_monitoring.db._schedules as schedules
+import mlrun.model_monitoring.helpers
+import mlrun.platforms.iguazio
+from mlrun.common.schemas import EndpointType
+from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventEndpointPolicy,
+    ControllerEventKind,
 )
+from mlrun.errors import err_to_str
+from mlrun.model_monitoring.helpers import batch_dict2timedelta
 from mlrun.utils import datetime_now, logger
+_SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
+_SECONDS_IN_MINUTE = 60
 class _Interval(NamedTuple):
     start: datetime.datetime
@@ -45,12 +56,12 @@ class _Interval(NamedTuple):
 class _BatchWindow:
     def __init__(
         self,
-        project: str,
-        endpoint: str,
+        *,
+        schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
         application: str,
         timedelta_seconds: int,
-        last_updated: Optional[int],
-        first_request: Optional[int],
+        last_updated: int,
+        first_request: int,
     ) -> None:
         """
         Initialize a batch window object that handles the batch interval time range
@@ -58,159 +69,133 @@ class _BatchWindow:
         All the time values are in seconds.
         The start and stop time are in seconds since the epoch.
         """
-        self.project = project
-        self._endpoint = endpoint
         self._application = application
         self._first_request = first_request
         self._stop = last_updated
         self._step = timedelta_seconds
-        self._db = mlrun.model_monitoring.get_store_object(project=self.project)
+        self._db = schedules_file
         self._start = self._get_last_analyzed()
-    def _get_last_analyzed(self) -> Optional[int]:
-        try:
-            last_analyzed = self._db.get_last_analyzed(
-                endpoint_id=self._endpoint,
-                application_name=self._application,
-            )
-        except mlrun.errors.MLRunNotFoundError:
-            logger.info(
-                "No last analyzed time was found for this endpoint and "
-                "application, as this is probably the first time this "
-                "application is running. Using the latest between first "
-                "request time or last update time minus one day instead",
-                endpoint=self._endpoint,
-                application=self._application,
-                first_request=self._first_request,
-                last_updated=self._stop,
-            )
+    def _get_saved_last_analyzed(self) -> Optional[int]:
+        return cast(int, self._db.get_application_time(self._application))
-            if self._first_request and self._stop:
-                # TODO : Change the timedelta according to the policy.
-                first_period_in_seconds = max(
-                    int(datetime.timedelta(days=1).total_seconds()), self._step
-                )  # max between one day and the base period
-                return max(
-                    self._first_request,
-                    self._stop - first_period_in_seconds,
-                )
-            return self._first_request
-        logger.info(
-            "Got the last analyzed time for this endpoint and application",
-            endpoint=self._endpoint,
-            application=self._application,
-            last_analyzed=last_analyzed,
+    def _update_last_analyzed(self, last_analyzed: int) -> None:
+        self._db.update_application_time(
+            application=self._application, timestamp=last_analyzed
         )
-        return last_analyzed
-    def _update_last_analyzed(self, last_analyzed: int) -> None:
+    def _get_initial_last_analyzed(self) -> int:
         logger.info(
-            "Updating the last analyzed time for this endpoint and application",
-            endpoint=self._endpoint,
+            "No last analyzed time was found for this endpoint and application, as this is "
+            "probably the first time this application is running. Initializing last analyzed "
+            "to the latest between first request time or last update time minus one day",
             application=self._application,
-            last_analyzed=last_analyzed,
+            first_request=self._first_request,
+            last_updated=self._stop,
         )
-        self._db.update_last_analyzed(
-            endpoint_id=self._endpoint,
-            application_name=self._application,
-            last_analyzed=last_analyzed,
+        # max between one day and the base period
+        first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
+        return max(
+            self._first_request,
+            self._stop - first_period_in_seconds,
         )
-    def get_intervals(
-        self,
-    ) -> Iterator[_Interval]:
-        """Generate the batch interval time ranges."""
-        if self._start is not None and self._stop is not None:
-            entered = False
-            # Iterate timestamp from start until timestamp <= stop - step
-            # so that the last interval will end at (timestamp + step) <= stop.
-            # Add 1 to stop - step to get <= and not <.
-            for timestamp in range(
-                self._start, self._stop - self._step + 1, self._step
-            ):
-                entered = True
-                start_time = datetime.datetime.fromtimestamp(
-                    timestamp, tz=datetime.timezone.utc
-                )
-                end_time = datetime.datetime.fromtimestamp(
-                    timestamp + self._step, tz=datetime.timezone.utc
-                )
-                yield _Interval(start_time, end_time)
-                self._update_last_analyzed(timestamp + self._step)
-            if not entered:
-                logger.info(
-                    "All the data is set, but no complete intervals were found. "
-                    "Wait for last_updated to be updated",
-                    endpoint=self._endpoint,
-                    application=self._application,
-                    start=self._start,
-                    stop=self._stop,
-                    step=self._step,
-                )
+    def _get_last_analyzed(self) -> int:
+        saved_last_analyzed = self._get_saved_last_analyzed()
+        if saved_last_analyzed is not None:
+            return saved_last_analyzed
         else:
-            logger.warn(
-                "The first request time is not found for this endpoint. "
-                "No intervals will be generated",
-                endpoint=self._endpoint,
+            last_analyzed = self._get_initial_last_analyzed()
+            # Update the in-memory DB to avoid duplicate initializations
+            self._update_last_analyzed(last_analyzed)
+        return last_analyzed
+    def get_intervals(self) -> Iterator[_Interval]:
+        """Generate the batch interval time ranges."""
+        entered = False
+        # Iterate timestamp from start until timestamp <= stop - step
+        # so that the last interval will end at (timestamp + step) <= stop.
+        # Add 1 to stop - step to get <= and not <.
+        for timestamp in range(self._start, self._stop - self._step + 1, self._step):
+            entered = True
+            start_time = datetime.datetime.fromtimestamp(
+                timestamp, tz=datetime.timezone.utc
+            )
+            end_time = datetime.datetime.fromtimestamp(
+                timestamp + self._step, tz=datetime.timezone.utc
+            )
+            yield _Interval(start_time, end_time)
+            last_analyzed = timestamp + self._step
+            self._update_last_analyzed(last_analyzed)
+            logger.debug(
+                "Updated the last analyzed time for this endpoint and application",
+                application=self._application,
+                last_analyzed=last_analyzed,
+            )
+        if not entered:
+            logger.debug(
+                "All the data is set, but no complete intervals were found. "
+                "Wait for last_updated to be updated",
                 application=self._application,
                 start=self._start,
                 stop=self._stop,
+                step=self._step,
             )
-class _BatchWindowGenerator:
-    def __init__(self, batch_dict: Union[dict, str]) -> None:
+class _BatchWindowGenerator(AbstractContextManager):
+    def __init__(
+        self, project: str, endpoint_id: str, window_length: Optional[int] = None
+    ) -> None:
         """
         Initialize a batch window generator object that generates batch window objects
         for the monitoring functions.
         """
-        self._batch_dict = batch_dict
-        self._norm_batch_dict()
-        self._timedelta = self._get_timedelta()
-    def _norm_batch_dict(self) -> None:
-        # TODO: This will be removed once the job params can be parsed with different types
-        # Convert batch dict string into a dictionary
-        if isinstance(self._batch_dict, str):
-            self._parse_batch_dict_str()
-    def _parse_batch_dict_str(self) -> None:
-        """Convert batch dictionary string into a valid dictionary"""
-        characters_to_remove = "{} "
-        pattern = "[" + characters_to_remove + "]"
-        # Remove unnecessary characters from the provided string
-        batch_list = re.sub(pattern, "", self._batch_dict).split(",")
-        # Initialize the dictionary of batch interval ranges
-        self._batch_dict = {}
-        for pair in batch_list:
-            pair_list = pair.split(":")
-            self._batch_dict[pair_list[0]] = float(pair_list[1])
-    def _get_timedelta(self) -> int:
-        """Get the timedelta in seconds from the batch dictionary"""
-        return int(
-            batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
+        self.batch_window: _BatchWindow = None
+        self._project = project
+        self._endpoint_id = endpoint_id
+        self._timedelta = window_length
+        self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
+            project=project, endpoint_id=endpoint_id
         )
+    def __enter__(self) -> "_BatchWindowGenerator":
+        self._schedules_file.__enter__()
+        return super().__enter__()
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> Optional[bool]:
+        self._schedules_file.__exit__(
+            exc_type=exc_type, exc_value=exc_value, traceback=traceback
+        )
+    def get_application_list(self) -> set[str]:
+        return self._schedules_file.get_application_list()
+    def get_min_last_analyzed(self) -> Optional[int]:
+        return self._schedules_file.get_min_timestamp()
     @classmethod
     def _get_last_updated_time(
-        cls, last_request: Optional[str], has_stream: bool
-    ) -> Optional[int]:
+        cls, last_request: datetime.datetime, not_batch_endpoint: bool
+    ) -> int:
         """
         Get the last updated time of a model endpoint.
         """
-        if not last_request:
-            return None
         last_updated = int(
-            cls._date_string2timestamp(last_request)
+            last_request.timestamp()
             - cast(
                 float,
                 mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
         )
-        if not has_stream:
+        if not not_batch_endpoint:
             # If the endpoint does not have a stream, `last_updated` should be
             # the minimum between the current time and the last updated time.
             # This compensates for the bumping mechanism - see
@@ -221,45 +206,38 @@ class _BatchWindowGenerator:
             )
         return last_updated
-    @classmethod
-    def _normalize_first_request(
-        cls, first_request: Optional[str], endpoint: str
-    ) -> Optional[int]:
-        if not first_request:
-            logger.debug(
-                "There is no first request time for this endpoint.",
-                endpoint=endpoint,
-                first_request=first_request,
-            )
-            return None
-        return cls._date_string2timestamp(first_request)
-    @staticmethod
-    def _date_string2timestamp(date_string: str) -> int:
-        return int(datetime.datetime.fromisoformat(date_string).timestamp())
-    def get_batch_window(
+    def get_intervals(
         self,
-        project: str,
-        endpoint: str,
+        *,
         application: str,
-        first_request: Optional[str],
-        last_request: Optional[str],
-        has_stream: bool,
-    ) -> _BatchWindow:
+        first_request: datetime.datetime,
+        last_request: datetime.datetime,
+        not_batch_endpoint: bool,
+    ) -> Iterator[_Interval]:
         """
         Get the batch window for a specific endpoint and application.
-        first_request is the first request time to the endpoint.
+        `first_request` and `last_request` are the timestamps of the first request and last
+        request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
         """
-        return _BatchWindow(
-            project=project,
-            endpoint=endpoint,
+        self.batch_window = _BatchWindow(
+            schedules_file=self._schedules_file,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(last_request, has_stream),
-            first_request=self._normalize_first_request(first_request, endpoint),
+            last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
+            first_request=int(first_request.timestamp()),
         )
+        yield from self.batch_window.get_intervals()
+def _get_window_length() -> int:
+    """Get the timedelta in seconds from the batch dictionary"""
+    return int(
+        batch_dict2timedelta(
+            json.loads(
+                cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
+            )
+        ).total_seconds()
+    )
 class MonitoringApplicationController:
@@ -269,27 +247,79 @@ class MonitoringApplicationController:
     Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
     """
+    _MAX_FEATURE_SET_PER_WORKER = 1000
     def __init__(self) -> None:
         """Initialize Monitoring Application Controller"""
         self.project = cast(str, mlrun.mlconf.default_project)
-        self.project_obj = mlrun.load_project(name=self.project, url=self.project)
+        self.project_obj = mlrun.get_run_db().get_project(name=self.project)
         logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
-        self.db = mlrun.model_monitoring.get_store_object(project=self.project)
+        self._window_length = _get_window_length()
-        self._batch_window_generator = _BatchWindowGenerator(
-            batch_dict=json.loads(
-                mlrun.get_secret_or_env(
-                    mm_constants.EventFieldType.BATCH_INTERVALS_DICT
-                )
-            )
+        self.model_monitoring_access_key = self._get_model_monitoring_access_key()
+        self.v3io_access_key = mlrun.mlconf.get_v3io_access_key()
+        store, _, _ = mlrun.store_manager.get_or_create_store(
+            mlrun.mlconf.artifact_path
+        )
+        self.storage_options = store.get_storage_options()
+        self._controller_stream: Optional[
+            Union[
+                mlrun.platforms.iguazio.OutputStream,
+                mlrun.platforms.iguazio.KafkaOutputStream,
+            ]
+        ] = None
+        self._model_monitoring_stream: Optional[
+            Union[
+                mlrun.platforms.iguazio.OutputStream,
+                mlrun.platforms.iguazio.KafkaOutputStream,
+            ]
+        ] = None
+        self.applications_streams: dict[
+            str,
+            Union[
+                mlrun.platforms.iguazio.OutputStream,
+                mlrun.platforms.iguazio.KafkaOutputStream,
+            ],
+        ] = {}
+        self.feature_sets: OrderedDict[str, mlrun.feature_store.FeatureSet] = (
+            collections.OrderedDict()
+        )
+        self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+            project=self.project
         )
-        self.model_monitoring_access_key = self._get_model_monitoring_access_key()
-        self.storage_options = None
-        if mlconf.artifact_path.startswith("s3://"):
-            self.storage_options = mlrun.mlconf.get_s3_storage_options()
+    @property
+    def controller_stream(
+        self,
+    ) -> Union[
+        mlrun.platforms.iguazio.OutputStream,
+        mlrun.platforms.iguazio.KafkaOutputStream,
+    ]:
+        if self._controller_stream is None:
+            self._controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
+                project=self.project,
+                function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+                v3io_access_key=self.v3io_access_key,
+            )
+        return self._controller_stream
+    @property
+    def model_monitoring_stream(
+        self,
+    ) -> Union[
+        mlrun.platforms.iguazio.OutputStream,
+        mlrun.platforms.iguazio.KafkaOutputStream,
+    ]:
+        if self._model_monitoring_stream is None:
+            self._model_monitoring_stream = (
+                mlrun.model_monitoring.helpers.get_output_stream(
+                    project=self.project,
+                    function_name=mm_constants.MonitoringFunctionNames.STREAM,
+                    v3io_access_key=self.model_monitoring_access_key,
+                )
+            )
+        return self._model_monitoring_stream
     @staticmethod
     def _get_model_monitoring_access_key() -> Optional[str]:
@@ -299,168 +329,302 @@ class MonitoringApplicationController:
             access_key = mlrun.mlconf.get_v3io_access_key()
         return access_key
-    def run(self) -> None:
+    def _should_monitor_endpoint(
+        self,
+        endpoint: mlrun.common.schemas.ModelEndpoint,
+        application_names: set,
+        base_period_minutes: int,
+        schedules_file: schedules.ModelMonitoringSchedulesFileChief,
+    ) -> bool:
         """
-        Main method for run all the relevant monitoring applications on each endpoint.
-        This method handles the following:
-        1. List model endpoints
-        2. List applications
-        3. Check model monitoring windows
-        4. Send data to applications
-        5. Delete old parquets
+        Check whether a regular event should be pushed for the given endpoint. All of the following
+        conditions must hold:
+            1.  monitoring_mode is enabled
+            2.  first request exists
+            3.  last request exists
+            4.  endpoint_type is not ROUTER
+        If the four conditions above apply, one of the following two conditions is also required:
+            1.  there are applications to monitor, and either the application names differ from the list
+            returned by the batch window generator or min_last_analyzed is empty (never analyzed).
+            2.  min_last_analyzed meets the condition for sending a NOP event, and this is the first time a
+            regular event is sent for the current combination of last_request and min_last_analyzed of the
+            endpoint. In this case the combination is written to the chief schedules file.
+        :param endpoint:            The model endpoint to check.
+        :param application_names:   Names of the monitoring applications to run.
+        :param base_period_minutes: The base period, in minutes, passed to the NOP event check.
+        :param schedules_file:      Chief schedules file, read for (and updated with) the last_request and
+                                    last_analyzed values of the endpoint.
+        :return: True if a regular event should be pushed for the endpoint, False otherwise.
         """
-        logger.info("Start running monitoring controller")
-        try:
-            applications_names = []
-            endpoints = self.db.list_model_endpoints(include_stats=True)
-            if not endpoints:
-                logger.info("No model endpoints found", project=self.project)
-                return
-            monitoring_functions = self.project_obj.list_model_monitoring_functions()
-            if monitoring_functions:
-                applications_names = list(
-                    {app.metadata.name for app in monitoring_functions}
+        last_timestamp_sent = schedules_file.get_endpoint_last_request(
+            endpoint.metadata.uid
+        )
+        last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(
+            endpoint.metadata.uid
+        )
+        logger.debug(
+            "Chief should monitor endpoint check",
+            last_timestamp_sent=last_timestamp_sent,
+            last_analyzed_sent=last_analyzed_sent,
+            uid=endpoint.metadata.uid,
+        )
+        if (
+            # Is the model endpoint monitored?
+            endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
+            # Was the model endpoint called? I.e., are the first and last requests nonempty?
+            and endpoint.status.first_request
+            and endpoint.status.last_request
+            # Is the model endpoint not a router endpoint? Router endpoint has no feature stats
+            and endpoint.metadata.endpoint_type.value
+            != mm_constants.EndpointType.ROUTER.value
+        ):
+            with _BatchWindowGenerator(
+                project=endpoint.metadata.project,
+                endpoint_id=endpoint.metadata.uid,
+            ) as batch_window_generator:
+                current_time = mlrun.utils.datetime_now()
+                current_min_last_analyzed = (
+                    batch_window_generator.get_min_last_analyzed()
                 )
-            # if monitoring_functions: - TODO : ML-7700
-            #   Gets only application in ready state
-            #   applications_names = list(
-            #       {
-            #           app.metadata.name
-            #           for app in monitoring_functions
-            #           if (
-            #               app.status.state == "ready"
-            #               # workaround for the default app, as its `status.state` is `None`
-            #               or app.metadata.name
-            #               == mm_constants.HistogramDataDriftApplicationConstants.NAME
-            #           )
-            #       }
-            #   )
-            if not applications_names:
-                logger.info("No monitoring functions found", project=self.project)
-                return
+                if (
+                    # Different application names, or last analyzed never updated while there are application to monitor
+                    application_names
+                    and (
+                        application_names
+                        != batch_window_generator.get_application_list()
+                        or not current_min_last_analyzed
+                    )
+                ):
+                    return True
+                elif (
+                    # Will a NOP event be sent to close the relevant window?
+                    self._should_send_nop_event(
+                        base_period_minutes, current_min_last_analyzed, current_time
+                    )
+                    and (
+                        int(endpoint.status.last_request.timestamp())
+                        != last_timestamp_sent
+                        or current_min_last_analyzed != last_analyzed_sent
+                    )
+                ):
+                    # Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
+                    schedules_file.update_endpoint_timestamps(
+                        endpoint_uid=endpoint.metadata.uid,
+                        last_request=int(endpoint.status.last_request.timestamp()),
+                        last_analyzed=current_min_last_analyzed,
+                    )
+                    return True
+                else:
+                    logger.info(
+                        "All the possible intervals were already analyzed, didn't push regular event",
+                        endpoint_id=endpoint.metadata.uid,
+                        last_analyzed=current_min_last_analyzed,
+                        last_request=endpoint.status.last_request,
+                    )
+        else:
             logger.info(
-                "Starting to iterate over the applications",
-                applications=applications_names,
+                "Should not monitor model endpoint, didn't push regular event",
+                endpoint_id=endpoint.metadata.uid,
+                endpoint_name=endpoint.metadata.name,
+                last_request=endpoint.status.last_request,
+                first_request=endpoint.status.first_request,
+                endpoint_type=endpoint.metadata.endpoint_type,
+                feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
             )
+        return False
+    @staticmethod
+    def _should_send_nop_event(
+        base_period_minutes: int,
+        min_last_analyzed: int,
+        current_time: datetime.datetime,
+    ):
+        """
+        Tell whether a NOP event should be sent.
+        :param base_period_minutes: The base period in minutes.
+        :param min_last_analyzed:   The minimal last analyzed timestamp (seconds); an empty value means
+                                    nothing was analyzed yet.
+        :param current_time:        The current time.
+        :return: True when `min_last_analyzed` is empty, or when the time that passed since it is at least
+                 the base period plus the configured parquet batching timeout.
+        """
+        if not min_last_analyzed:
+            return True
+        seconds_since_last_analyzed = current_time.timestamp() - min_last_analyzed
+        required_gap_seconds = (
+            datetime.timedelta(minutes=base_period_minutes).total_seconds()
+            + mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
+        )
+        return seconds_since_last_analyzed >= required_gap_seconds
+    def run(self, event: nuclio_sdk.Event) -> None:
+        """
+        Main method for the controller worker, runs all the relevant monitoring applications for a single
+        endpoint. Handles nop events logic.
+        The event body is decoded as UTF-8 JSON and passed to `model_endpoint_process`, which handles the
+        following:
+        1. Read applications from the event (endpoint_policy)
+        2. Check model monitoring windows
+        3. Send data to applications
+        4. Pushes nop event to main stream if needed
+        If the event body cannot be decoded, the error is logged and the event is dropped.
+        :param event: The Nuclio event that triggered the worker.
+        """
+        logger.info("Start running monitoring controller worker")
+        try:
+            body = json.loads(event.body.decode("utf-8"))
         except Exception as e:
             logger.error(
-                "Failed to list endpoints and monitoring applications",
+                "Failed to decode event",
                 exc=err_to_str(e),
             )
             return
-        # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=min(len(endpoints), 10),
-        ) as pool:
-            for endpoint in endpoints:
-                if (
-                    endpoint[mm_constants.EventFieldType.ACTIVE]
-                    and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
-                    == mm_constants.ModelMonitoringMode.enabled.value
-                ):
-                    # Skip router endpoint:
-                    if (
-                        int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
-                        == mm_constants.EndpointType.ROUTER
-                    ):
-                        # Router endpoint has no feature stats
-                        logger.info(
-                            f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
-                        )
-                        continue
-                    pool.submit(
-                        MonitoringApplicationController.model_endpoint_process,
-                        endpoint=endpoint,
-                        applications_names=applications_names,
-                        batch_window_generator=self._batch_window_generator,
-                        project=self.project,
-                        model_monitoring_access_key=self.model_monitoring_access_key,
-                        storage_options=self.storage_options,
-                    )
+        # Run single endpoint process
+        self.model_endpoint_process(event=body)
-    @classmethod
     def model_endpoint_process(
-        cls,
-        endpoint: dict,
-        applications_names: list[str],
-        batch_window_generator: _BatchWindowGenerator,
-        project: str,
-        model_monitoring_access_key: str,
-        storage_options: Optional[dict] = None,
+        self,
+        event: Optional[dict] = None,
     ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
-        for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
-        for a specific time range.
-        :param endpoint:                    (dict) Model endpoint record.
-        :param applications_names:          (list[str]) List of application names to push results to.
-        :param batch_window_generator:      (_BatchWindowGenerator) An object that generates _BatchWindow objects.
-        :param project:                     (str) Project name.
-        :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
-        :param storage_options:             (dict) Storage options for reading the infer parquet files.
+        for each endpoint.
+        For every monitoring application listed in the event policy, the intervals returned by the batch
+        window generator are checked for data (TSDB predictions for non-batch endpoints, the monitoring
+        feature set for batch endpoints), and intervals that contain data are pushed to the application
+        stream. Afterwards, if the NOP condition holds and the processed event is not a NOP event itself,
+        a NOP event is pushed to the main model monitoring stream.
+        Any exception is logged and swallowed, so a faulty event does not fail the caller.
+        :param event:                       (dict) Event that triggered the monitoring process.
         """
-        endpoint_id = endpoint[mm_constants.EventFieldType.UID]
-        has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
-        m_fs = fstore.get_feature_set(
-            endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
-        )
+        logger.info("Model endpoint process started", event=event)
         try:
-            for application in applications_names:
-                batch_window = batch_window_generator.get_batch_window(
-                    project=project,
-                    endpoint=endpoint_id,
-                    application=application,
-                    first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
-                    last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
-                    has_stream=has_stream,
-                )
+            project_name = event[ControllerEvent.PROJECT]
+            endpoint_id = event[ControllerEvent.ENDPOINT_ID]
+            endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
+            applications_names = event[ControllerEvent.ENDPOINT_POLICY][
+                ControllerEventEndpointPolicy.MONITORING_APPLICATIONS
+            ]
+            not_batch_endpoint = (
+                event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
+            )
-                for start_infer_time, end_infer_time in batch_window.get_intervals():
-                    df = m_fs.to_dataframe(
-                        start_time=start_infer_time,
-                        end_time=end_infer_time,
-                        time_column=mm_constants.EventFieldType.TIMESTAMP,
-                        storage_options=storage_options,
+            logger.info(
+                "Starting analyzing for", timestamp=event[ControllerEvent.TIMESTAMP]
+            )
+            last_stream_timestamp = datetime.datetime.fromisoformat(
+                event[ControllerEvent.TIMESTAMP]
+            )
+            first_request = datetime.datetime.fromisoformat(
+                event[ControllerEvent.FIRST_REQUEST]
+            )
+            with _BatchWindowGenerator(
+                project=project_name,
+                endpoint_id=endpoint_id,
+                window_length=self._window_length,
+            ) as batch_window_generator:
+                for application in applications_names:
+                    for (
+                        start_infer_time,
+                        end_infer_time,
+                    ) in batch_window_generator.get_intervals(
+                        application=application,
+                        not_batch_endpoint=not_batch_endpoint,
+                        first_request=first_request,
+                        last_request=last_stream_timestamp,
+                    ):
+                        data_in_window = False
+                        if not_batch_endpoint:
+                            # Serving endpoint - get the relevant window data from the TSDB
+                            prediction_metric = self.tsdb_connector.read_predictions(
+                                start=start_infer_time,
+                                end=end_infer_time,
+                                endpoint_id=endpoint_id,
+                            )
+                            if prediction_metric.data:
+                                data_in_window = True
+                        else:
+                            if endpoint_id not in self.feature_sets:
+                                self.feature_sets[endpoint_id] = fstore.get_feature_set(
+                                    event[ControllerEvent.FEATURE_SET_URI]
+                                )
+                            # Keep the most recently used feature set at the front of the ordered dict,
+                            # and drop the entry at the back once the per-worker limit is exceeded
+                            self.feature_sets.move_to_end(endpoint_id, last=False)
+                            if (
+                                len(self.feature_sets)
+                                > self._MAX_FEATURE_SET_PER_WORKER
+                            ):
+                                self.feature_sets.popitem(last=True)
+                            m_fs = self.feature_sets.get(endpoint_id)
+                            # Batch endpoint - get the relevant window data from the parquet target
+                            df = m_fs.to_dataframe(
+                                start_time=start_infer_time,
+                                end_time=end_infer_time,
+                                time_column=mm_constants.EventFieldType.TIMESTAMP,
+                                storage_options=self.storage_options,
+                            )
+                            if len(df) > 0:
+                                data_in_window = True
+                        if not data_in_window:
+                            logger.info(
+                                "No data found for the given interval",
+                                start=start_infer_time,
+                                end=end_infer_time,
+                                endpoint_id=endpoint_id,
+                            )
+                        else:
+                            logger.info(
+                                "Data found for the given interval",
+                                start=start_infer_time,
+                                end=end_infer_time,
+                                endpoint_id=endpoint_id,
+                            )
+                            self._push_to_applications(
+                                start_infer_time=start_infer_time,
+                                end_infer_time=end_infer_time,
+                                endpoint_id=endpoint_id,
+                                endpoint_name=endpoint_name,
+                                project=project_name,
+                                applications_names=[application],
+                                model_monitoring_access_key=self.model_monitoring_access_key,
+                                endpoint_updated=event[ControllerEvent.ENDPOINT_POLICY][
+                                    ControllerEventEndpointPolicy.ENDPOINT_UPDATED
+                                ],
+                            )
+                base_period = event[ControllerEvent.ENDPOINT_POLICY][
+                    ControllerEventEndpointPolicy.BASE_PERIOD
+                ]
+                current_time = mlrun.utils.datetime_now()
+                if (
+                    self._should_send_nop_event(
+                        base_period,
+                        batch_window_generator.get_min_last_analyzed(),
+                        current_time,
                     )
-                    if len(df) == 0:
-                        logger.info(
-                            "No data found for the given interval",
-                            start=start_infer_time,
-                            end=end_infer_time,
-                            endpoint_id=endpoint_id,
-                        )
-                    else:
-                        logger.info(
-                            "Data found for the given interval",
-                            start=start_infer_time,
-                            end=end_infer_time,
-                            endpoint_id=endpoint_id,
-                        )
-                        cls._push_to_applications(
-                            start_infer_time=start_infer_time,
-                            end_infer_time=end_infer_time,
-                            endpoint_id=endpoint_id,
-                            project=project,
-                            applications_names=[application],
-                            model_monitoring_access_key=model_monitoring_access_key,
-                        )
+                    and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
+                ):
+                    # From here on `event` refers to the generated NOP event
+                    event = {
+                        ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
+                        ControllerEvent.PROJECT: project_name,
+                        ControllerEvent.ENDPOINT_ID: endpoint_id,
+                        ControllerEvent.ENDPOINT_NAME: endpoint_name,
+                        ControllerEvent.TIMESTAMP: current_time.isoformat(
+                            timespec="microseconds"
+                        ),
+                        ControllerEvent.ENDPOINT_POLICY: event[
+                            ControllerEvent.ENDPOINT_POLICY
+                        ],
+                        ControllerEvent.ENDPOINT_TYPE: event[
+                            ControllerEvent.ENDPOINT_TYPE
+                        ],
+                        ControllerEvent.FEATURE_SET_URI: event[
+                            ControllerEvent.FEATURE_SET_URI
+                        ],
+                        ControllerEvent.FIRST_REQUEST: event[
+                            ControllerEvent.FIRST_REQUEST
+                        ],
+                    }
+                    self._push_to_main_stream(
+                        event=event,
+                        endpoint_id=endpoint_id,
+                    )
+            logger.info(
+                "Finish analyze for", timestamp=event[ControllerEvent.TIMESTAMP]
+            )
         except Exception:
+            # Resolve the endpoint ID defensively: indexing `event` here would raise again, and hide the
+            # original error, when the event is `None` or has no endpoint ID key
             logger.exception(
                 "Encountered an exception",
-                endpoint_id=endpoint[mm_constants.EventFieldType.UID],
+                endpoint_id=(
+                    event.get(ControllerEvent.ENDPOINT_ID)
+                    if isinstance(event, dict)
+                    else None
+                ),
             )
-    @staticmethod
     def _push_to_applications(
+        self,
         start_infer_time: datetime.datetime,
         end_infer_time: datetime.datetime,
         endpoint_id: str,
+        endpoint_name: str,
         project: str,
         applications_names: list[str],
         model_monitoring_access_key: str,
+        endpoint_updated: str,
     ):
         """
         Pushes data to multiple stream applications.
@@ -471,7 +635,7 @@ class MonitoringApplicationController:
         :param project: mlrun               Project name.
         :param applications_names:          List of application names to which data will be pushed.
         :param model_monitoring_access_key: Access key to apply the model monitoring process.
+        :param endpoint_updated:            ISO-format string of the timestamp at which the model endpoint was updated
         """
         data = {
             mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
@@ -481,28 +645,250 @@ class MonitoringApplicationController:
                 sep=" ", timespec="microseconds"
             ),
             mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
-            mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
-                project=project,
-                function_name=mm_constants.MonitoringFunctionNames.WRITER,
-            ),
+            mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
+            mm_constants.ApplicationEvent.ENDPOINT_UPDATED: endpoint_updated,
         }
         for app_name in applications_names:
             data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
-            stream_uri = get_stream_path(project=project, function_name=app_name)
+            if app_name not in self.applications_streams:
+                self.applications_streams[app_name] = (
+                    mlrun.model_monitoring.helpers.get_output_stream(
+                        project=project,
+                        function_name=app_name,
+                        v3io_access_key=model_monitoring_access_key,
+                    )
+                )
+            app_stream = self.applications_streams.get(app_name)
             logger.info(
-                f"push endpoint_id {endpoint_id} to {app_name} by stream :{stream_uri}"
+                "Pushing data to application stream",
+                endpoint_id=endpoint_id,
+                app_name=app_name,
+                app_stream_type=str(type(app_stream)),
+            )
+            app_stream.push([data], partition_key=endpoint_id)
+    def push_regular_event_to_controller_stream(self) -> None:
+        """
+        Run the controller chief: for every model endpoint of the project, submit a task that pushes a
+        regular event to the controller stream if the endpoint should be monitored.
+        The method returns early when the project has no model endpoints or no monitoring functions.
+        All submitted tasks are awaited before the chief schedules file is closed, and every task failure
+        is logged together with the endpoint it belongs to.
+        """
+        logger.info("Starting monitoring controller chief")
+        applications_names = []
+        endpoints = self.project_obj.list_model_endpoints(tsdb_metrics=False).endpoints
+        # Bail out before querying the TSDB - without endpoints there is nothing to look up
+        if not endpoints:
+            logger.info("No model endpoints found", project=self.project)
+            return
+        last_request_dict = self.tsdb_connector.get_last_request(
+            endpoint_ids=[mep.metadata.uid for mep in endpoints]
+        )
+        if isinstance(last_request_dict, pd.DataFrame):
+            last_request_dict = last_request_dict.set_index(
+                mm_constants.EventFieldType.ENDPOINT_ID
+            )[mm_constants.ModelEndpointSchema.LAST_REQUEST].to_dict()
+        monitoring_functions = self.project_obj.list_model_monitoring_functions()
+        if monitoring_functions:
+            # if monitoring_functions: - TODO : ML-7700
+            #   Gets only application in ready state
+            #   applications_names = list(
+            #       {
+            #           app.metadata.name
+            #           for app in monitoring_functions
+            #           if (
+            #               app.status.state == "ready"
+            #               # workaround for the default app, as its `status.state` is `None`
+            #               or app.metadata.name
+            #               == mm_constants.HistogramDataDriftApplicationConstants.NAME
+            #           )
+            #       }
+            #   )
+            applications_names = list(
+                {app.metadata.name for app in monitoring_functions}
+            )
+        if not applications_names:
+            logger.info("No monitoring functions found", project=self.project)
+            return
+        policy = {
+            ControllerEventEndpointPolicy.MONITORING_APPLICATIONS: applications_names,
+            ControllerEventEndpointPolicy.BASE_PERIOD: int(
+                batch_dict2timedelta(
+                    json.loads(
+                        cast(
+                            str,
+                            os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
+                        )
+                    )
+                ).total_seconds()
+                // _SECONDS_IN_MINUTE
+            ),
+        }
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=min(len(endpoints), 10)
+        ) as pool:
+            with schedules.ModelMonitoringSchedulesFileChief(
+                self.project
+            ) as schedule_file:
+                # Map every submitted future to its endpoint. The mapping must accumulate across
+                # iterations: rebinding it per endpoint would leave only the last future to be awaited,
+                # so the schedules file could be closed while other tasks still use it and their
+                # failures would never be logged.
+                futures = {}
+                for endpoint in endpoints:
+                    last_request = last_request_dict.get(endpoint.metadata.uid, None)
+                    if isinstance(last_request, float):
+                        last_request = pd.to_datetime(last_request, unit="s", utc=True)
+                    endpoint.status.last_request = (
+                        last_request or endpoint.status.last_request
+                    )
+                    future = pool.submit(
+                        self.endpoint_to_regular_event,
+                        endpoint,
+                        policy,
+                        set(applications_names),
+                        schedule_file,
+                    )
+                    futures[future] = endpoint
+                for future in concurrent.futures.as_completed(futures):
+                    exception = future.exception()
+                    if exception:
+                        error = (
+                            f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
+                            f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
+                        )
+                        error += "".join(
+                            traceback.format_exception(
+                                None, exception, exception.__traceback__
+                            )
+                        )
+                        logger.error(error)
+        logger.info("Finishing monitoring controller chief")
+    def endpoint_to_regular_event(
+        self,
+        endpoint: mlrun.common.schemas.ModelEndpoint,
+        policy: dict,
+        applications_names: set,
+        schedule_file: schedules.ModelMonitoringSchedulesFileChief,
+    ) -> None:
+        """
+        Push a regular event to the controller stream for the given endpoint, if it should be monitored.
+        :param endpoint:           The model endpoint to create the event for.
+        :param policy:             The monitoring policy shared by all the endpoints. It is not modified;
+                                   the endpoint's update time is added to a per-endpoint copy.
+        :param applications_names: Names of the monitoring applications to run.
+        :param schedule_file:      Chief schedules file, passed to the should-monitor check.
+        """
+        if self._should_monitor_endpoint(
+            endpoint,
+            set(applications_names),
+            policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
+            schedule_file,
+        ):
+            logger.debug(
+                "Endpoint data is being prepared for regular event",
+                endpoint_id=endpoint.metadata.uid,
+                endpoint_name=endpoint.metadata.name,
+                timestamp=endpoint.status.last_request.isoformat(
+                    sep=" ", timespec="microseconds"
+                ),
+                first_request=endpoint.status.first_request.isoformat(
+                    sep=" ", timespec="microseconds"
+                ),
+                endpoint_type=endpoint.metadata.endpoint_type,
+                feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                endpoint_policy=json.dumps(policy),
+            )
+            # This method runs concurrently for many endpoints with the same `policy` object, so the
+            # per-endpoint update time goes into a copy - writing it into the shared dict would let one
+            # endpoint's event carry another endpoint's update time
+            endpoint_policy = dict(policy)
+            endpoint_policy[ControllerEventEndpointPolicy.ENDPOINT_UPDATED] = (
+                endpoint.metadata.updated.isoformat()
             )
-            get_stream_pusher(stream_uri, access_key=model_monitoring_access_key).push(
-                [data]
+            self.push_to_controller_stream(
+                kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
+                project=endpoint.metadata.project,
+                endpoint_id=endpoint.metadata.uid,
+                endpoint_name=endpoint.metadata.name,
+                timestamp=endpoint.status.last_request.isoformat(
+                    sep=" ", timespec="microseconds"
+                ),
+                first_request=endpoint.status.first_request.isoformat(
+                    sep=" ", timespec="microseconds"
+                ),
+                endpoint_type=endpoint.metadata.endpoint_type.value,
+                feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                endpoint_policy=endpoint_policy,
             )
+    def push_to_controller_stream(
+        self,
+        kind: str,
+        project: str,
+        endpoint_id: str,
+        endpoint_name: str,
+        timestamp: str,
+        first_request: str,
+        endpoint_type: int,
+        feature_set_uri: str,
+        endpoint_policy: dict[str, Any],
+    ) -> None:
+        """
+        Build a controller event from the given fields and push it to the controller stream, using the
+        endpoint id as the partition key.
+        :param kind:            str event kind
+        :param project:         project name
+        :param endpoint_id:     endpoint id string
+        :param endpoint_name:   the endpoint name string
+        :param timestamp:       the event timestamp str isoformat utc timezone
+        :param first_request:   the first request str isoformat utc timezone
+        :param endpoint_type:   the endpoint type value
+        :param feature_set_uri: the feature set uri string
+        :param endpoint_policy: dictionary hold the monitoring policy
+        """
+        event_fields = (
+            (ControllerEvent.KIND, kind),
+            (ControllerEvent.PROJECT, project),
+            (ControllerEvent.ENDPOINT_ID, endpoint_id),
+            (ControllerEvent.ENDPOINT_NAME, endpoint_name),
+            (ControllerEvent.TIMESTAMP, timestamp),
+            (ControllerEvent.FIRST_REQUEST, first_request),
+            (ControllerEvent.ENDPOINT_TYPE, endpoint_type),
+            (ControllerEvent.FEATURE_SET_URI, feature_set_uri),
+            (ControllerEvent.ENDPOINT_POLICY, endpoint_policy),
+        )
+        event = {field.value: value for field, value in event_fields}
+        stream_type_name = str(type(self.controller_stream))
+        logger.info(
+            "Pushing data to controller stream",
+            event=event,
+            endpoint_id=endpoint_id,
+            controller_stream_type=stream_type_name,
+        )
+        self.controller_stream.push([event], partition_key=endpoint_id)
+    def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
+        """
+        Log the given event and push it to the model monitoring stream, using the endpoint id as the
+        partition key.
+        :param event: event dictionary to push to stream
+        :param endpoint_id: endpoint id string
+        """
+        serialized_event = json.dumps(event)
+        stream_type_name = str(type(self.model_monitoring_stream))
+        logger.info(
+            "Pushing data to main stream, NOP event is been generated",
+            event=serialized_event,
+            endpoint_id=endpoint_id,
+            mm_stream_type=stream_type_name,
+        )
+        self.model_monitoring_stream.push([event], partition_key=endpoint_id)
-def handler(context: nuclio.Context, event: nuclio.Event) -> None:
+def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
     """
     Run model monitoring application processor
+    Dispatches by the kind of the event trigger: cron trigger kinds run the controller chief, stream
+    trigger kinds run the controller worker on the event.
     :param context: the Nuclio context
     :param event:   trigger event
+    :raise mlrun.errors.MLRunInvalidArgumentError: if the trigger kind is neither a cron kind nor a stream kind
     """
-    MonitoringApplicationController().run()
+    logger.info(
+        "Controller got event",
+        trigger=event.trigger,
+        trigger_kind=event.trigger.kind,
+    )
+    if event.trigger.kind in mm_constants.CRON_TRIGGER_KINDS:
+        # Runs controller chief:
+        context.user_data.monitor_app_controller.push_regular_event_to_controller_stream()
+    elif event.trigger.kind in mm_constants.STREAM_TRIGGER_KINDS:
+        # Runs controller worker:
+        context.user_data.monitor_app_controller.run(event)
+    else:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "Wrong trigger kind for model monitoring controller"
+        )
+def init_context(context):
+    """
+    Create a monitoring application controller and store it on the user data of the given context as
+    `monitor_app_controller`.
+    :param context: the context whose user data and logger are used
+    """
+    context.user_data.monitor_app_controller = MonitoringApplicationController()
+    context.logger.info("Monitoring application controller initialized")

mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl

Potentially problematic release.

mlrun 1.7.2rc3py3-none-any.whl → 1.8.0py3-none-any.whl