PyPI - mlrun - Versions diffs - 1.8.0rc19__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl - Mend

mlrun 1.8.0rc19__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. Click here for more details.

Files changed (52)

mlrun/__init__.py +37 -3
mlrun/__main__.py +5 -0
mlrun/alerts/alert.py +1 -0
mlrun/artifacts/document.py +78 -36
mlrun/common/formatters/feature_set.py +1 -0
mlrun/common/runtimes/constants.py +17 -0
mlrun/common/schemas/alert.py +3 -0
mlrun/common/schemas/client_spec.py +0 -1
mlrun/common/schemas/model_monitoring/constants.py +32 -9
mlrun/common/schemas/model_monitoring/model_endpoints.py +2 -0
mlrun/common/schemas/workflow.py +1 -0
mlrun/config.py +39 -6
mlrun/datastore/datastore_profile.py +58 -16
mlrun/datastore/sources.py +7 -1
mlrun/datastore/vectorstore.py +20 -1
mlrun/db/base.py +20 -0
mlrun/db/httpdb.py +97 -10
mlrun/db/nopdb.py +19 -0
mlrun/errors.py +4 -0
mlrun/execution.py +15 -6
mlrun/frameworks/_common/model_handler.py +0 -2
mlrun/launcher/client.py +2 -2
mlrun/launcher/local.py +5 -1
mlrun/model_monitoring/applications/_application_steps.py +3 -1
mlrun/model_monitoring/controller.py +266 -103
mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +20 -21
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -34
mlrun/model_monitoring/helpers.py +16 -10
mlrun/model_monitoring/stream_processing.py +106 -35
mlrun/package/context_handler.py +1 -1
mlrun/package/packagers_manager.py +4 -18
mlrun/projects/pipelines.py +18 -5
mlrun/projects/project.py +156 -39
mlrun/runtimes/nuclio/serving.py +22 -13
mlrun/runtimes/sparkjob/spark3job.py +1 -1
mlrun/secrets.py +1 -1
mlrun/serving/server.py +11 -3
mlrun/serving/states.py +65 -8
mlrun/serving/v2_serving.py +67 -44
mlrun/utils/helpers.py +111 -23
mlrun/utils/notifications/notification/base.py +6 -1
mlrun/utils/notifications/notification/slack.py +5 -1
mlrun/utils/notifications/notification_pusher.py +67 -36
mlrun/utils/version/version.json +2 -2
{mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/METADATA +33 -16
{mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/RECORD +52 -52
{mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/WHEEL +1 -1
{mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/LICENSE +0 -0
{mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/entry_points.txt +0 -0
{mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/top_level.txt +0 -0

mlrun/model_monitoring/controller.py CHANGED Viewed

@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import concurrent.futures
 import datetime
 import json
 import os
 from collections.abc import Iterator
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import NamedTuple, Optional, cast
+from typing import Any, NamedTuple, Optional, cast
 import nuclio_sdk
@@ -28,6 +27,10 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.feature_store as fstore
 import mlrun.model_monitoring
 from mlrun.common.schemas import EndpointType
+from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventKind,
+)
 from mlrun.datastore import get_stream_pusher
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
@@ -140,6 +143,7 @@ class _BatchWindowGenerator(AbstractContextManager):
         Initialize a batch window generator object that generates batch window objects
         for the monitoring functions.
         """
+        self.batch_window: _BatchWindow = None
         self._project = project
         self._endpoint_id = endpoint_id
         self._timedelta = window_length
@@ -199,14 +203,14 @@ class _BatchWindowGenerator(AbstractContextManager):
         `first_request` and `last_request` are the timestamps of the first request and last
         request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
         """
-        batch_window = _BatchWindow(
+        self.batch_window = _BatchWindow(
             schedules_file=self._schedules_file,
             application=application,
             timedelta_seconds=self._timedelta,
             last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
             first_request=int(first_request.timestamp()),
         )
-        yield from batch_window.get_intervals()
+        yield from self.batch_window.get_intervals()
 def _get_window_length() -> int:
@@ -237,6 +241,7 @@ class MonitoringApplicationController:
         self._window_length = _get_window_length()
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
+        self.v3io_access_key = mlrun.get_secret_or_env("V3IO_ACCESS_KEY")
         self.storage_options = None
         if mlrun.mlconf.artifact_path.startswith("s3://"):
             self.storage_options = mlrun.mlconf.get_s3_storage_options()
@@ -262,112 +267,65 @@ class MonitoringApplicationController:
             != mm_constants.EndpointType.ROUTER.value
         )
-    def run(self) -> None:
+    def run(self, event: nuclio_sdk.Event) -> None:
         """
-        Main method for run all the relevant monitoring applications on each endpoint.
+        Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
+        Handles nop events logic.
         This method handles the following:
-        1. List model endpoints
-        2. List applications
-        3. Check model monitoring windows
-        4. Send data to applications
-        5. Delete old parquets
+        1. Read applications from the event (endpoint_policy)
+        2. Check model monitoring windows
+        3. Send data to applications
+        4. Pushes nop event to main stream if needed
         """
-        logger.info("Start running monitoring controller")
+        logger.info("Start running monitoring controller worker")
         try:
-            applications_names = []
-            endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
-                project=self.project, tsdb_metrics=True
-            )
-            endpoints = endpoints_list.endpoints
-            if not endpoints:
-                logger.info("No model endpoints found", project=self.project)
-                return
-            monitoring_functions = self.project_obj.list_model_monitoring_functions()
-            if monitoring_functions:
-                applications_names = list(
-                    {app.metadata.name for app in monitoring_functions}
-                )
-            # if monitoring_functions: - TODO : ML-7700
-            #   Gets only application in ready state
-            #   applications_names = list(
-            #       {
-            #           app.metadata.name
-            #           for app in monitoring_functions
-            #           if (
-            #               app.status.state == "ready"
-            #               # workaround for the default app, as its `status.state` is `None`
-            #               or app.metadata.name
-            #               == mm_constants.HistogramDataDriftApplicationConstants.NAME
-            #           )
-            #       }
-            #   )
-            if not applications_names:
-                logger.info("No monitoring functions found", project=self.project)
-                return
-            logger.info(
-                "Starting to iterate over the applications",
-                applications=applications_names,
-            )
+            body = json.loads(event.body.decode("utf-8"))
         except Exception as e:
             logger.error(
-                "Failed to list endpoints and monitoring applications",
+                "Failed to decode event",
                 exc=err_to_str(e),
             )
             return
-        # Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=min(len(endpoints), 10)
-        ) as pool:
-            for endpoint in endpoints:
-                if self._should_monitor_endpoint(endpoint):
-                    pool.submit(
-                        MonitoringApplicationController.model_endpoint_process,
-                        project=self.project,
-                        endpoint=endpoint,
-                        applications_names=applications_names,
-                        window_length=self._window_length,
-                        model_monitoring_access_key=self.model_monitoring_access_key,
-                        storage_options=self.storage_options,
-                    )
-                else:
-                    logger.debug(
-                        "Skipping endpoint, not ready or not suitable for monitoring",
-                        endpoint_id=endpoint.metadata.uid,
-                        endpoint_name=endpoint.metadata.name,
-                    )
-        logger.info("Finished running monitoring controller")
+        # Run single endpoint process
+        self.model_endpoint_process(event=body)
-    @classmethod
     def model_endpoint_process(
-        cls,
-        project: str,
-        endpoint: mlrun.common.schemas.ModelEndpoint,
-        applications_names: list[str],
-        window_length: int,
-        model_monitoring_access_key: str,
-        storage_options: Optional[dict] = None,
+        self,
+        event: Optional[dict] = None,
     ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
-        for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
-        for a specific time range.
-        :param endpoint:                    (dict) Model endpoint record.
-        :param applications_names:          (list[str]) List of application names to push results to.
-        :param batch_window_generator:      (_BatchWindowGenerator) An object that generates _BatchWindow objects.
-        :param project:                     (str) Project name.
-        :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
-        :param storage_options:             (dict) Storage options for reading the infer parquet files.
+        for each endpoint.
+        :param event:                       (dict) Event that triggered the monitoring process.
         """
-        endpoint_id = endpoint.metadata.uid
-        not_batch_endpoint = not (
-            endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
-        )
-        m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
+        logger.info("Model endpoint process started", event=event)
         try:
+            project_name = event[ControllerEvent.PROJECT]
+            endpoint_id = event[ControllerEvent.ENDPOINT_ID]
+            endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
+            applications_names = event[ControllerEvent.ENDPOINT_POLICY][
+                "monitoring_applications"
+            ]
+            not_batch_endpoint = (
+                event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
+            )
+            m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
+            logger.info(
+                "Starting analyzing for:", timestamp=event[ControllerEvent.TIMESTAMP]
+            )
+            last_stream_timestamp = datetime.datetime.fromisoformat(
+                event[ControllerEvent.TIMESTAMP]
+            )
+            first_request = datetime.datetime.fromisoformat(
+                event[ControllerEvent.FIRST_REQUEST]
+            )
             with _BatchWindowGenerator(
-                project=project, endpoint_id=endpoint_id, window_length=window_length
+                project=project_name,
+                endpoint_id=endpoint_id,
+                window_length=self._window_length,
             ) as batch_window_generator:
                 for application in applications_names:
                     for (
@@ -375,15 +333,15 @@ class MonitoringApplicationController:
                         end_infer_time,
                     ) in batch_window_generator.get_intervals(
                         application=application,
-                        first_request=endpoint.status.first_request,
-                        last_request=endpoint.status.last_request,
                         not_batch_endpoint=not_batch_endpoint,
+                        first_request=first_request,
+                        last_request=last_stream_timestamp,
                     ):
                         df = m_fs.to_dataframe(
                             start_time=start_infer_time,
                             end_time=end_infer_time,
                             time_column=mm_constants.EventFieldType.TIMESTAMP,
-                            storage_options=storage_options,
+                            storage_options=self.storage_options,
                         )
                         if len(df) == 0:
                             logger.info(
@@ -399,21 +357,53 @@ class MonitoringApplicationController:
                                 end=end_infer_time,
                                 endpoint_id=endpoint_id,
                             )
-                            cls._push_to_applications(
+                            self._push_to_applications(
                                 start_infer_time=start_infer_time,
                                 end_infer_time=end_infer_time,
                                 endpoint_id=endpoint_id,
-                                endpoint_name=endpoint.metadata.name,
-                                project=project,
+                                endpoint_name=endpoint_name,
+                                project=project_name,
                                 applications_names=[application],
-                                model_monitoring_access_key=model_monitoring_access_key,
+                                model_monitoring_access_key=self.model_monitoring_access_key,
                             )
-                logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
+                base_period = event[ControllerEvent.ENDPOINT_POLICY]["base_period"]
+                current_time = mlrun.utils.datetime_now()
+                if (
+                    current_time.timestamp()
+                    - batch_window_generator.batch_window._get_last_analyzed()
+                    >= datetime.timedelta(minutes=base_period).total_seconds()
+                    and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
+                ):
+                    event = {
+                        ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
+                        ControllerEvent.PROJECT: project_name,
+                        ControllerEvent.ENDPOINT_ID: endpoint_id,
+                        ControllerEvent.ENDPOINT_NAME: endpoint_name,
+                        ControllerEvent.TIMESTAMP: current_time.isoformat(
+                            timespec="microseconds"
+                        ),
+                        ControllerEvent.ENDPOINT_POLICY: event[
+                            ControllerEvent.ENDPOINT_POLICY
+                        ],
+                        ControllerEvent.ENDPOINT_TYPE: event[
+                            ControllerEvent.ENDPOINT_TYPE
+                        ],
+                        ControllerEvent.FEATURE_SET_URI: event[
+                            ControllerEvent.FEATURE_SET_URI
+                        ],
+                        ControllerEvent.FIRST_REQUEST: event[
+                            ControllerEvent.FIRST_REQUEST
+                        ],
+                    }
+                    self._push_to_main_stream(
+                        event=event,
+                        endpoint_id=endpoint_id,
+                    )
         except Exception:
             logger.exception(
                 "Encountered an exception",
-                endpoint_id=endpoint.metadata.uid,
+                endpoint_id=event[ControllerEvent.ENDPOINT_ID],
             )
     @staticmethod
@@ -465,6 +455,168 @@ class MonitoringApplicationController:
                 [data]
             )
+    def push_regular_event_to_controller_stream(self, event: nuclio_sdk.Event) -> None:
+        """
+        pushes a regular event to the controller stream.
+        :param event: the nuclio trigger event
+        """
+        logger.info("Starting monitoring controller chief")
+        applications_names = []
+        db = mlrun.get_run_db()
+        endpoints = db.list_model_endpoints(
+            project=self.project, tsdb_metrics=True
+        ).endpoints
+        if not endpoints:
+            logger.info("No model endpoints found", project=self.project)
+            return
+        monitoring_functions = self.project_obj.list_model_monitoring_functions()
+        if monitoring_functions:
+            # if monitoring_functions: - TODO : ML-7700
+            #   Gets only application in ready state
+            #   applications_names = list(
+            #       {
+            #           app.metadata.name
+            #           for app in monitoring_functions
+            #           if (
+            #               app.status.state == "ready"
+            #               # workaround for the default app, as its `status.state` is `None`
+            #               or app.metadata.name
+            #               == mm_constants.HistogramDataDriftApplicationConstants.NAME
+            #           )
+            #       }
+            #   )
+            applications_names = list(
+                {app.metadata.name for app in monitoring_functions}
+            )
+        if not applications_names:
+            logger.info("No monitoring functions found", project=self.project)
+            return
+        policy = {
+            "monitoring_applications": applications_names,
+            "base_period": int(
+                batch_dict2timedelta(
+                    json.loads(
+                        cast(
+                            str,
+                            os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
+                        )
+                    )
+                ).total_seconds()
+                // 60
+            ),
+        }
+        for endpoint in endpoints:
+            if self._should_monitor_endpoint(endpoint):
+                logger.info(
+                    "Regular event is being pushed to controller stream for model endpoint",
+                    endpoint_id=endpoint.metadata.uid,
+                    endpoint_name=endpoint.metadata.name,
+                    timestamp=endpoint.status.last_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    first_request=endpoint.status.first_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    endpoint_type=endpoint.metadata.endpoint_type,
+                    feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                    endpoint_policy=json.dumps(policy),
+                )
+                self.push_to_controller_stream(
+                    kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
+                    project=self.project,
+                    endpoint_id=endpoint.metadata.uid,
+                    endpoint_name=endpoint.metadata.name,
+                    stream_access_key=self.v3io_access_key,
+                    timestamp=endpoint.status.last_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    first_request=endpoint.status.first_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    endpoint_type=endpoint.metadata.endpoint_type,
+                    feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                    endpoint_policy=policy,
+                )
+            else:
+                logger.info(
+                    "Should not monitor model endpoint, didn't push regular event",
+                    endpoint_id=endpoint.metadata.uid,
+                    endpoint_name=endpoint.metadata.name,
+                    timestamp=endpoint.status.last_request,
+                    first_request=endpoint.status.first_request,
+                    endpoint_type=endpoint.metadata.endpoint_type,
+                    feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                )
+    @staticmethod
+    def push_to_controller_stream(
+        kind: str,
+        project: str,
+        endpoint_id: str,
+        endpoint_name: str,
+        stream_access_key: str,
+        timestamp: str,
+        first_request: str,
+        endpoint_type: str,
+        feature_set_uri: str,
+        endpoint_policy: dict[str, Any],
+    ) -> None:
+        """
+        Pushes event data to controller stream.
+        :param timestamp: the event timestamp str isoformat utc timezone
+        :param first_request: the first request str isoformat utc timezone
+        :param endpoint_policy: dictionary hold the monitoring policy
+        :param kind: str event kind
+        :param project: project name
+        :param endpoint_id: endpoint id string
+        :param endpoint_name: the endpoint name string
+        :param endpoint_type: Enum of the endpoint type
+        :param feature_set_uri: the feature set uri string
+        :param stream_access_key: access key to apply the model monitoring process.
+        """
+        stream_uri = get_stream_path(
+            project=project,
+            function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+        )
+        event = {
+            ControllerEvent.KIND.value: kind,
+            ControllerEvent.PROJECT.value: project,
+            ControllerEvent.ENDPOINT_ID.value: endpoint_id,
+            ControllerEvent.ENDPOINT_NAME.value: endpoint_name,
+            ControllerEvent.TIMESTAMP.value: timestamp,
+            ControllerEvent.FIRST_REQUEST.value: first_request,
+            ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
+            ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
+            ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
+        }
+        logger.info(
+            "Pushing data to controller stream",
+            event=event,
+            endpoint_id=endpoint_id,
+            stream_uri=stream_uri,
+        )
+        get_stream_pusher(stream_uri, access_key=stream_access_key).push(
+            [event], partition_key=endpoint_id
+        )
+    def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
+        """
+        Pushes the given event to model monitoring stream
+        :param event: event dictionary to push to stream
+        :param endpoint_id: endpoint id string
+        """
+        stream_uri = get_stream_path(project=event.get(ControllerEvent.PROJECT))
+        logger.info(
+            "Pushing data to main stream, NOP event is been generated",
+            event=json.dumps(event),
+            endpoint_id=endpoint_id,
+            stream_uri=stream_uri,
+        )
+        get_stream_pusher(stream_uri, access_key=self.model_monitoring_access_key).push(
+            [event], partition_key=endpoint_id
+        )
 def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
     """
@@ -473,4 +625,15 @@ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
     :param context: the Nuclio context
     :param event:   trigger event
     """
-    MonitoringApplicationController().run()
+    logger.info(
+        "Controller got event",
+        trigger=event.trigger,
+        trigger_kind=event.trigger.kind,
+    )
+    if event.trigger.kind == "http":
+        # Runs controller chief:
+        MonitoringApplicationController().push_regular_event_to_controller_stream(event)
+    else:
+        # Runs controller worker:
+        MonitoringApplicationController().run(event=event)

mlrun/model_monitoring/db/tsdb/__init__.py CHANGED Viewed

@@ -67,43 +67,31 @@ class ObjectTSDBFactory(enum.Enum):
 def get_tsdb_connector(
     project: str,
     secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
-    tsdb_connection_string: typing.Optional[str] = None,
-    **kwargs,
+    profile: typing.Optional[mlrun.datastore.datastore_profile.DatastoreProfile] = None,
 ) -> TSDBConnector:
     """
     Get TSDB connector object.
     :param project:                 The name of the project.
     :param secret_provider:         An optional secret provider to get the connection string secret.
-    :param tsdb_connection_string:  An optional explicit connection string to the TSDB.
+    :param profile:                 An optional profile to initialize the TSDB connector from.
     :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
              TSDB connector such as updating drift metrics or write application record result.
     :raise: `MLRunInvalidMMStoreTypeError` if the user didn't provide TSDB connection
             or the provided TSDB connection is invalid.
     """
-    try:
-        profile = mlrun.model_monitoring.helpers._get_tsdb_profile(
-            project=project, secret_provider=secret_provider
-        )
-    except mlrun.errors.MLRunNotFoundError:
-        profile = None
-    tsdb_connection_string = (
-        tsdb_connection_string
-        or mlrun.model_monitoring.helpers.get_tsdb_connection_string(
-            secret_provider=secret_provider
-        )
+    profile = profile or mlrun.model_monitoring.helpers._get_tsdb_profile(
+        project=project, secret_provider=secret_provider
     )
-    if tsdb_connection_string and tsdb_connection_string.startswith("taosws"):
-        tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
-        kwargs["connection_string"] = tsdb_connection_string
-    elif tsdb_connection_string and tsdb_connection_string == "v3io":
-        tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
-    elif isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
+    kwargs = {}
+    if isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
         tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
         kwargs["v3io_access_key"] = profile.v3io_access_key
+    elif isinstance(
+        profile, mlrun.datastore.datastore_profile.TDEngineDatastoreProfile
+    ):
+        tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
+        kwargs["connection_string"] = profile.dsn()
     else:
         raise mlrun.errors.MLRunInvalidMMStoreTypeError(
             "You must provide a valid tsdb store connection by using "

mlrun/model_monitoring/db/tsdb/tdengine/schemas.py CHANGED Viewed

@@ -298,6 +298,8 @@ class Predictions(TDEngineSchema):
             mm_schemas.EventFieldType.TIME: _TDEngineColumn.TIMESTAMP,
             mm_schemas.EventFieldType.LATENCY: _TDEngineColumn.FLOAT,
             mm_schemas.EventKeyMetrics.CUSTOM_METRICS: _TDEngineColumn.BINARY_1000,
+            mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT: _TDEngineColumn.FLOAT,
+            mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT: _TDEngineColumn.INT,
         }
         tags = {
             mm_schemas.WriterEvent.ENDPOINT_ID: _TDEngineColumn.BINARY_64,

mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py CHANGED Viewed

@@ -145,8 +145,11 @@ class TDEngineConnector(TSDBConnector):
         create_table_sql = table._create_subtable_sql(subtable=table_name, values=event)
+        # we need the string values to be sent to the connection, not the enum
+        columns = {str(key): str(val) for key, val in table.columns.items()}
         insert_statement = Statement(
-            columns=table.columns,
+            columns=columns,
             subtable=table_name,
             values=event,
         )
@@ -165,7 +168,7 @@ class TDEngineConnector(TSDBConnector):
         return datetime.fromisoformat(val) if isinstance(val, str) else val
     @staticmethod
-    def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]):
+    def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]) -> str:
         if isinstance(endpoint_id, str):
             return f"endpoint_id='{endpoint_id}'"
         elif isinstance(endpoint_id, list):
@@ -188,7 +191,7 @@ class TDEngineConnector(TSDBConnector):
             graph.add_step(
                 "mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ProcessBeforeTDEngine",
                 name="ProcessBeforeTDEngine",
-                after="MapFeatureNames",
+                after="FilterNOP",
             )
         def apply_tdengine_target(name, after):
@@ -206,6 +209,8 @@ class TDEngineConnector(TSDBConnector):
                 columns=[
                     mm_schemas.EventFieldType.LATENCY,
                     mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
+                    mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                    mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
                 ],
                 tag_cols=[
                     mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -483,7 +488,7 @@ class TDEngineConnector(TSDBConnector):
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id='{endpoint_id}'",
             agg_funcs=agg_funcs,
             interval=aggregation_window,
@@ -503,10 +508,10 @@ class TDEngineConnector(TSDBConnector):
             df["_wend"] = pd.to_datetime(df["_wend"])
             df.set_index("_wend", inplace=True)
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -514,7 +519,7 @@ class TDEngineConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
             ),  # pyright: ignore[reportArgumentType]
         )
@@ -525,9 +530,7 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
@@ -538,7 +541,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.EventFieldType.TIME,
                 mm_schemas.EventFieldType.LATENCY,
             ],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             timestamp_column=mm_schemas.EventFieldType.TIME,
             agg_funcs=["last"],
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -567,9 +570,7 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -580,7 +581,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             agg_funcs=["max"],
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -678,9 +679,8 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
@@ -691,8 +691,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["count"],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
             preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
         )

mlrun 1.8.0rc19__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl

Potentially problematic release.

mlrun 1.8.0rc19__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl