mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (107)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/base.py +0 -31
  3. mlrun/artifacts/document.py +6 -1
  4. mlrun/artifacts/llm_prompt.py +123 -25
  5. mlrun/artifacts/manager.py +0 -5
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/common/constants.py +10 -1
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/common/model_monitoring/helpers.py +86 -0
  10. mlrun/common/schemas/__init__.py +3 -0
  11. mlrun/common/schemas/auth.py +2 -0
  12. mlrun/common/schemas/function.py +10 -0
  13. mlrun/common/schemas/hub.py +30 -18
  14. mlrun/common/schemas/model_monitoring/__init__.py +3 -0
  15. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  16. mlrun/common/schemas/model_monitoring/functions.py +14 -5
  17. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
  18. mlrun/common/schemas/pipeline.py +1 -1
  19. mlrun/common/schemas/serving.py +3 -0
  20. mlrun/common/schemas/workflow.py +3 -1
  21. mlrun/common/secrets.py +22 -1
  22. mlrun/config.py +33 -11
  23. mlrun/datastore/__init__.py +11 -3
  24. mlrun/datastore/azure_blob.py +162 -47
  25. mlrun/datastore/datastore.py +9 -4
  26. mlrun/datastore/datastore_profile.py +61 -5
  27. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  28. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  29. mlrun/datastore/model_provider/model_provider.py +230 -65
  30. mlrun/datastore/model_provider/openai_provider.py +295 -42
  31. mlrun/datastore/s3.py +24 -2
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +47 -19
  35. mlrun/db/httpdb.py +120 -56
  36. mlrun/db/nopdb.py +38 -10
  37. mlrun/execution.py +70 -19
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +15 -0
  43. mlrun/model.py +24 -3
  44. mlrun/model_monitoring/__init__.py +1 -0
  45. mlrun/model_monitoring/api.py +66 -27
  46. mlrun/model_monitoring/applications/__init__.py +1 -1
  47. mlrun/model_monitoring/applications/base.py +509 -117
  48. mlrun/model_monitoring/applications/context.py +2 -4
  49. mlrun/model_monitoring/applications/results.py +4 -7
  50. mlrun/model_monitoring/controller.py +239 -101
  51. mlrun/model_monitoring/db/_schedules.py +116 -33
  52. mlrun/model_monitoring/db/_stats.py +4 -3
  53. mlrun/model_monitoring/db/tsdb/base.py +100 -9
  54. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
  55. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
  56. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  57. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  58. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
  59. mlrun/model_monitoring/helpers.py +54 -9
  60. mlrun/model_monitoring/stream_processing.py +45 -14
  61. mlrun/model_monitoring/writer.py +220 -1
  62. mlrun/platforms/__init__.py +3 -2
  63. mlrun/platforms/iguazio.py +7 -3
  64. mlrun/projects/operations.py +6 -1
  65. mlrun/projects/pipelines.py +46 -26
  66. mlrun/projects/project.py +166 -58
  67. mlrun/run.py +94 -17
  68. mlrun/runtimes/__init__.py +18 -0
  69. mlrun/runtimes/base.py +14 -6
  70. mlrun/runtimes/daskjob.py +7 -0
  71. mlrun/runtimes/local.py +5 -2
  72. mlrun/runtimes/mounts.py +20 -2
  73. mlrun/runtimes/mpijob/abstract.py +6 -0
  74. mlrun/runtimes/mpijob/v1.py +6 -0
  75. mlrun/runtimes/nuclio/__init__.py +1 -0
  76. mlrun/runtimes/nuclio/application/application.py +149 -17
  77. mlrun/runtimes/nuclio/function.py +76 -27
  78. mlrun/runtimes/nuclio/serving.py +97 -15
  79. mlrun/runtimes/pod.py +234 -21
  80. mlrun/runtimes/remotesparkjob.py +6 -0
  81. mlrun/runtimes/sparkjob/spark3job.py +6 -0
  82. mlrun/runtimes/utils.py +49 -11
  83. mlrun/secrets.py +54 -13
  84. mlrun/serving/__init__.py +2 -0
  85. mlrun/serving/remote.py +79 -6
  86. mlrun/serving/routers.py +23 -41
  87. mlrun/serving/server.py +320 -80
  88. mlrun/serving/states.py +725 -157
  89. mlrun/serving/steps.py +62 -0
  90. mlrun/serving/system_steps.py +200 -119
  91. mlrun/serving/v2_serving.py +9 -10
  92. mlrun/utils/helpers.py +288 -88
  93. mlrun/utils/logger.py +3 -1
  94. mlrun/utils/notifications/notification/base.py +18 -0
  95. mlrun/utils/notifications/notification/git.py +2 -4
  96. mlrun/utils/notifications/notification/slack.py +2 -4
  97. mlrun/utils/notifications/notification/webhook.py +2 -5
  98. mlrun/utils/notifications/notification_pusher.py +1 -1
  99. mlrun/utils/retryer.py +15 -2
  100. mlrun/utils/version/version.json +2 -2
  101. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
  102. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
  103. mlrun/api/schemas/__init__.py +0 -259
  104. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  105. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  106. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  107. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from io import StringIO
-from typing import Callable, Literal, Optional, Union
+from typing import Literal, Optional, Union
 
 import pandas as pd
 import v3io_frames
@@ -25,6 +25,7 @@ import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas import EventFieldType
+from mlrun.config import config
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.helpers import get_invocations_fqn, get_start_end
 from mlrun.utils import logger
@@ -369,6 +370,49 @@ class V3IOTSDBConnector(TSDBConnector):
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")
 
+    def apply_writer_steps(self, graph, after, **kwargs) -> None:
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_metrics",
+            after=after,
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.METRICS]}",
+            time_col=mm_schemas.WriterEvent.END_INFER_TIME,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            infer_columns_from_data=True,
+            graph_shape="cylinder",
+            index_cols=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_ID,
+                mm_schemas.MetricData.METRIC_NAME,
+            ],
+            max_events=config.model_endpoint_monitoring.writer_graph.max_events,
+            flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_app_results",
+            after=after,
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]}",
+            time_col=mm_schemas.WriterEvent.END_INFER_TIME,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            infer_columns_from_data=True,
+            graph_shape="cylinder",
+            index_cols=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_ID,
+                mm_schemas.ResultData.RESULT_NAME,
+            ],
+            max_events=config.model_endpoint_monitoring.writer_graph.max_events,
+            flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def handle_model_error(
         self,
         graph,
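
Note: both new TSDB targets read their batching thresholds from the shared writer-graph configuration. A minimal sketch of inspecting them, assuming a default-initialized config (`mlrun.mlconf` is the same object as `mlrun.config.config`); the printed values depend on the deployment:

    import mlrun

    # Batching knobs for the monitoring writer graph: a buffered batch
    # is flushed to the TSDB when either threshold is crossed.
    writer_cfg = mlrun.mlconf.model_endpoint_monitoring.writer_graph
    print(writer_cfg.max_events)           # flush after this many events
    print(writer_cfg.flush_after_seconds)  # ...or after this many seconds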
@@ -492,7 +536,8 @@ class V3IOTSDBConnector(TSDBConnector):
         # Split the endpoint ids into chunks to avoid exceeding the v3io-engine filter-expression limit
         for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
             endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
-            filter_query = f"endpoint_id IN({str(endpoint_id_chunk)[1:-1]}) "
+            endpoints_list = "', '".join(endpoint_id_chunk)
+            filter_query = f"endpoint_id IN('{endpoints_list}')"
             for table in tables:
                 try:
                     self.frames_client.delete(
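
Note: the rewritten query builder quotes each endpoint id explicitly instead of leaning on the list's repr. A standalone sketch of the strings both versions produce (the ids are made up):

    endpoint_id_chunk = ["ep-1", "ep-2"]

    # old: slice the brackets off the list's repr (quoting is implicit)
    old = f"endpoint_id IN({str(endpoint_id_chunk)[1:-1]}) "
    # -> "endpoint_id IN('ep-1', 'ep-2') "

    # new: join and quote the ids explicitly
    endpoints_list = "', '".join(endpoint_id_chunk)
    new = f"endpoint_id IN('{endpoints_list}')"
    # -> "endpoint_id IN('ep-1', 'ep-2')"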
@@ -532,6 +577,43 @@ class V3IOTSDBConnector(TSDBConnector):
                 project=self.project,
             )
 
+    def delete_application_records(
+        self, application_name: str, endpoint_ids: Optional[list[str]] = None
+    ) -> None:
+        """
+        Delete application records from the TSDB for the given model endpoints or all if ``endpoint_ids`` is ``None``.
+        """
+        base_filter_query = f"application_name=='{application_name}'"
+
+        filter_queries: list[str] = []
+        if endpoint_ids:
+            for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
+                endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
+                endpoints_list = "', '".join(endpoint_id_chunk)
+                filter_queries.append(
+                    f"{base_filter_query} AND endpoint_id IN ('{endpoints_list}')"
+                )
+        else:
+            filter_queries = [base_filter_query]
+
+        for table in [
+            self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS],
+            self.tables[mm_schemas.V3IOTSDBTables.METRICS],
+        ]:
+            logger.debug(
+                "Deleting application records from TSDB",
+                table=table,
+                filter_queries=filter_queries,
+                project=self.project,
+            )
+            for filter_query in filter_queries:
+                self.frames_client.delete(
+                    backend=_TSDB_BE,
+                    table=table,
+                    filter=filter_query,
+                    start="0",
+                )
+
     def get_model_endpoint_real_time_metrics(
         self, endpoint_id: str, metrics: list[str], start: str, end: str
     ) -> dict[str, list[tuple[str, float]]]:
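
Note: a hypothetical call to the new method, assuming an already-constructed V3IOTSDBConnector instance named `connector` (names are illustrative):

    # Remove one application's results and metrics for two endpoints;
    # omitting endpoint_ids (None) targets all endpoints instead.
    connector.delete_application_records(
        application_name="my-drift-app",
        endpoint_ids=["ep-1", "ep-2"],
    )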
@@ -935,6 +1017,9 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> dict[str, float]:
+        if not endpoint_ids:
+            return {}
+
         # Get the last request timestamp for each endpoint from the KV table.
         # The result of the query is a list of dictionaries,
         # each dictionary contains the endpoint id and the last request timestamp.
@@ -1145,11 +1230,9 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         return df.reset_index(drop=True)
 
-    async def add_basic_metrics(
+    def add_basic_metrics(
         self,
         model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
-        project: str,
-        run_in_threadpool: Callable,
         metric_list: Optional[list[str]] = None,
     ) -> list[mlrun.common.schemas.ModelEndpoint]:
         """
@@ -1157,8 +1240,6 @@ class V3IOTSDBConnector(TSDBConnector):
 
         :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
                                        be filled with the relevant basic metrics.
-        :param project: The name of the project.
-        :param run_in_threadpool: A function that runs another function in a thread pool.
         :param metric_list: List of metrics to include from the time series DB. Defaults to all metrics.
 
         :return: A list of `ModelEndpointMonitoringMetric` objects.
@@ -1187,8 +1268,7 @@ class V3IOTSDBConnector(TSDBConnector):
             function,
             _,
         ) in metric_name_to_function_and_column_name.items():
-            metric_name_to_result[metric_name] = await run_in_threadpool(
-                function,
+            metric_name_to_result[metric_name] = function(
                 endpoint_ids=uids,
                 get_raw=True,
             )
@@ -1259,7 +1339,7 @@ class V3IOTSDBConnector(TSDBConnector):
         else:
             filter_query = app_filter_query
 
-        df = self._get_records(
+        raw_frames: list[v3io_frames.client.RawFrame] = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
             end=end,
@@ -1268,39 +1348,33 @@ class V3IOTSDBConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
             ],
             filter_query=filter_query,
+            get_raw=True,
         )
 
-        # filter result status
-        if result_status_list and not df.empty:
-            df = df[df[mm_schemas.ResultData.RESULT_STATUS].isin(result_status_list)]
-
-        if df.empty:
+        if not raw_frames:
             return {}
-        else:
-            # convert application name to lower case
-            df[mm_schemas.ApplicationEvent.APPLICATION_NAME] = df[
-                mm_schemas.ApplicationEvent.APPLICATION_NAME
-            ].str.lower()
-
-            df = (
-                df[
-                    [
-                        mm_schemas.ApplicationEvent.APPLICATION_NAME,
-                        mm_schemas.ResultData.RESULT_STATUS,
-                        mm_schemas.ResultData.RESULT_VALUE,
-                    ]
-                ]
-                .groupby(
-                    [
-                        mm_schemas.ApplicationEvent.APPLICATION_NAME,
-                        mm_schemas.ResultData.RESULT_STATUS,
-                    ],
-                    observed=True,
-                )
-                .count()
-            )
 
-        return df[mm_schemas.ResultData.RESULT_VALUE].to_dict()
+        # Count occurrences by (application_name, result_status) from RawFrame objects
+        count_dict = {}
+
+        for frame in raw_frames:
+            # Extract column data from each RawFrame
+            app_name = frame.column_data(mm_schemas.ApplicationEvent.APPLICATION_NAME)[
+                0
+            ]
+            statuses = frame.column_data(mm_schemas.ResultData.RESULT_STATUS)
+
+            for status in statuses:
+                # Filter by result status if specified
+                if result_status_list and status not in result_status_list:
+                    continue
+
+                # Convert application name to lower case
+                key = (app_name.lower(), status)
+
+                # Update the count in the dictionary
+                count_dict[key] = count_dict.get(key, 0) + 1
+        return count_dict
 
     def count_processed_model_endpoints(
         self,
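
Note: the pandas groupby/count was replaced with a plain dictionary tally over the raw frames. The same pattern in isolation, as a self-contained sketch with dummy (app_name, status) rows:

    rows = [("MyApp", 0), ("myapp", 2), ("MyApp", 2)]

    count_dict = {}
    for app_name, status in rows:
        key = (app_name.lower(), status)  # application names are lower-cased
        count_dict[key] = count_dict.get(key, 0) + 1

    print(count_dict)  # {('myapp', 0): 1, ('myapp', 2): 2}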
@@ -1450,3 +1524,148 @@ class V3IOTSDBConnector(TSDBConnector):
             return metric_objects
 
         return build_metric_objects()
+
+    def get_drift_data(
+        self,
+        start: datetime,
+        end: datetime,
+    ) -> mm_schemas.ModelEndpointDriftValues:
+        table = mm_schemas.V3IOTSDBTables.APP_RESULTS
+        start, end, interval = self._prepare_aligned_start_end(start, end)
+        raw_frames: list[v3io_frames.client.RawFrame] = self._get_records(
+            table=table,
+            start=start,
+            end=end,
+            columns=[mm_schemas.ResultData.RESULT_STATUS],
+            get_raw=True,
+        )
+
+        if not raw_frames:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        aggregated_data = self._aggregate_raw_drift_data(
+            raw_frames=raw_frames, start=start, end=end, interval=interval
+        )
+        if not aggregated_data:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        # Filter to only include entries with max result_status >= 1
+        filtered_data = [
+            (endpoint_id, timestamp, max_status)
+            for endpoint_id, timestamp, max_status in aggregated_data
+            if max_status >= 1
+        ]
+
+        if not filtered_data:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        return self._convert_drift_data_to_values(aggregated_data=filtered_data)
+
+    @staticmethod
+    def _aggregate_raw_drift_data(
+        raw_frames: list[v3io_frames.client.RawFrame],
+        start: datetime,
+        end: datetime,
+        interval: str,
+    ) -> list[tuple[str, datetime, float]]:
+        """
+        Aggregate raw drift data from RawFrame objects.
+
+        :param raw_frames: List of RawFrame objects containing drift data.
+        :param start:      Start datetime for filtering data.
+        :param end:        End datetime for filtering data.
+        :param interval:   Time interval string (e.g., '5min') for aggregation.
+
+        :returns: List of tuples: (endpoint_id, timestamp, max_result_status).
+        """
+        if not raw_frames:
+            return []
+
+        # Parse interval to get timedelta
+        interval_td = pd.Timedelta(interval)
+
+        # Collect all data points from RawFrame objects
+        data_points = []
+        for frame in raw_frames:
+            endpoint_id = frame.column_data(EventFieldType.ENDPOINT_ID)[0]
+            result_statuses = frame.column_data(mm_schemas.ResultData.RESULT_STATUS)
+            timestamps = frame.indices()[0].times
+
+            # Combine data from this frame
+            for i, (status, timestamp) in enumerate(zip(result_statuses, timestamps)):
+                # V3IO TSDB returns timestamps in nanoseconds
+                timestamp_dt = pd.Timestamp(
+                    timestamp, unit="ns", tzinfo=timezone.utc
+                ).to_pydatetime()
+
+                # Filter by time window
+                if start <= timestamp_dt < end:
+                    data_points.append((endpoint_id, timestamp_dt, status))
+
+        if not data_points:
+            return []
+
+        # Group by endpoint_id and time intervals, then find max status
+        # Create time buckets aligned to start
+        grouped_data = {}
+        for endpoint_id, timestamp, status in data_points:
+            # Calculate which interval bucket this timestamp falls into
+            time_diff = timestamp - start
+            bucket_index = int(time_diff / interval_td)
+            bucket_start = start + (bucket_index * interval_td)
+
+            key = (endpoint_id, bucket_start)
+            if key not in grouped_data:
+                grouped_data[key] = status
+            else:
+                # Keep the maximum status value
+                grouped_data[key] = max(grouped_data[key], status)
+
+        # Convert to list of tuples
+        result = [
+            (endpoint_id, timestamp, max_status)
+            for (endpoint_id, timestamp), max_status in grouped_data.items()
+        ]
+
+        return result
+
+    @staticmethod
+    def _convert_drift_data_to_values(
+        aggregated_data: list[tuple[str, datetime, float]],
+    ) -> mm_schemas.ModelEndpointDriftValues:
+        """
+        Convert aggregated drift data to ModelEndpointDriftValues format.
+
+        :param aggregated_data: List of tuples (endpoint_id, timestamp, max_result_status).
+        :return: ModelEndpointDriftValues with counts of suspected and detected per timestamp.
+        """
+        suspected_val = mm_schemas.constants.ResultStatusApp.potential_detection.value
+        detected_val = mm_schemas.constants.ResultStatusApp.detected.value
+
+        # Group by timestamp and result status, then count occurrences
+        timestamp_status_counts = {}
+        for _, timestamp, max_status in aggregated_data:
+            key = (timestamp, max_status)
+            timestamp_status_counts[key] = timestamp_status_counts.get(key, 0) + 1
+
+        # Organize by timestamp with counts for suspected and detected
+        timestamp_counts = {}
+        for (timestamp, status), count in timestamp_status_counts.items():
+            if timestamp not in timestamp_counts:
+                timestamp_counts[timestamp] = {
+                    "count_suspected": 0,
+                    "count_detected": 0,
+                }
+
+            if status == suspected_val:
+                timestamp_counts[timestamp]["count_suspected"] = count
+            elif status == detected_val:
+                timestamp_counts[timestamp]["count_detected"] = count
+
+        # Convert to the expected format: list of (timestamp, count_suspected, count_detected)
+        values = [
+            (timestamp, counts["count_suspected"], counts["count_detected"])
+            for timestamp, counts in sorted(timestamp_counts.items())
+        ]
+
+        return mm_schemas.ModelEndpointDriftValues(values=values)
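
Note: `_aggregate_raw_drift_data` aligns every sample to a fixed grid of `interval`-sized buckets anchored at `start`. A worked example of the index arithmetic (standalone sketch, with an assumed 5-minute interval):

    from datetime import datetime, timedelta, timezone

    import pandas as pd

    start = datetime(2025, 1, 1, tzinfo=timezone.utc)
    interval_td = pd.Timedelta("5min")

    ts = start + timedelta(minutes=12)  # 12 minutes past the grid origin
    bucket_index = int((ts - start) / interval_td)     # -> 2
    bucket_start = start + bucket_index * interval_td  # -> the 00:10 bucket

    print(bucket_index, bucket_start)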
mlrun/model_monitoring/helpers.py

@@ -143,7 +143,7 @@ def get_stream_path(
         return stream_uri.replace("v3io://", f"ds://{profile.name}")
 
     elif isinstance(
-        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
     ):
         topic = mlrun.common.model_monitoring.helpers.get_kafka_topic(
             project=project, function_name=function_name
@@ -152,7 +152,7 @@ def get_stream_path(
     else:
         raise mlrun.errors.MLRunValueError(
             f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
         )
 
 
@@ -300,7 +300,7 @@ def _get_v3io_output_stream(
 
 def _get_kafka_output_stream(
     *,
-    kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource,
+    kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream,
     project: str,
     function_name: str,
     mock: bool = False,
@@ -356,7 +356,7 @@ def get_output_stream(
         )
 
     elif isinstance(
-        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
    ):
         return _get_kafka_output_stream(
             kafka_profile=profile,
@@ -368,7 +368,7 @@ def get_output_stream(
     else:
         raise mlrun.errors.MLRunValueError(
             f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
         )
 
 
@@ -549,6 +549,10 @@ def _get_monitoring_schedules_folder_path(project: str) -> str:
     )
 
 
+def _get_monitoring_schedules_user_folder_path(out_path: str) -> str:
+    return os.path.join(out_path, mm_constants.FileTargetKind.MONITORING_SCHEDULES)
+
+
 def _get_monitoring_schedules_file_endpoint_path(
     *, project: str, endpoint_id: str
 ) -> str:
@@ -570,10 +574,7 @@ def get_monitoring_schedules_endpoint_data(
     )
 
 
-def get_monitoring_schedules_chief_data(
-    *,
-    project: str,
-) -> "DataItem":
+def get_monitoring_schedules_chief_data(*, project: str) -> "DataItem":
     """
     Get the model monitoring schedules' data item of the project's model endpoint.
     """
@@ -582,6 +583,19 @@ def get_monitoring_schedules_chief_data(
     )
 
 
+def get_monitoring_schedules_user_application_data(
+    *, out_path: str, application: str
+) -> "DataItem":
+    """
+    Get the model monitoring schedules' data item of user application runs.
+    """
+    return mlrun.datastore.store_manager.object(
+        _get_monitoring_schedules_file_user_application_path(
+            out_path=out_path, application=application
+        )
+    )
+
+
 def _get_monitoring_schedules_file_chief_path(
     *,
     project: str,
@@ -591,6 +605,14 @@ def _get_monitoring_schedules_file_chief_path(
     )
 
 
+def _get_monitoring_schedules_file_user_application_path(
+    *, out_path: str, application: str
+) -> str:
+    return os.path.join(
+        _get_monitoring_schedules_user_folder_path(out_path), f"{application}.json"
+    )
+
+
 def get_start_end(
     start: Union[datetime.datetime, None],
     end: Union[datetime.datetime, None],
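
Note: chaining the two new helpers, the per-application schedules file lands under the user's out path. A sketch of the resulting path, assuming `mm_constants.FileTargetKind.MONITORING_SCHEDULES` resolves to the string "monitoring-schedules" (an illustrative value, not verified here):

    import os

    out_path = "v3io:///projects/my-proj/artifacts"
    application = "my-app"

    # mirrors _get_monitoring_schedules_file_user_application_path
    path = os.path.join(out_path, "monitoring-schedules", f"{application}.json")
    print(path)  # v3io:///projects/my-proj/artifacts/monitoring-schedules/my-app.json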
@@ -637,3 +659,26 @@ def get_start_end(
         )
 
     return start, end
+
+
+def validate_time_range(
+    start: Optional[datetime.datetime] = None, end: Optional[datetime.datetime] = None
+) -> tuple[datetime.datetime, datetime.datetime]:
+    """
+    Validate the start and end parameters and set default values if needed.
+
+    :param start: Either None or a datetime; None is handled as datetime.now(tz=timezone.utc) - timedelta(days=1).
+    :param end:   Either None or a datetime; None is handled as datetime.now(tz=timezone.utc).
+    :return: Start datetime, end datetime.
+    """
+    end = end or mlrun.utils.helpers.datetime_now()
+    start = start or (end - datetime.timedelta(days=1))
+    if start.tzinfo is None or end.tzinfo is None:
+        raise mlrun.errors.MLRunInvalidArgumentTypeError(
+            "Custom start and end times must contain the timezone."
+        )
+    if start > end:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "The start time must be before the end time. Note that if end time is not provided, "
+            "the current time is used by default."
+        )
+    return start, end
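
Note: usage follows the contract above; missing bounds default to the last 24 hours and naive datetimes are rejected. A minimal sketch, assuming `validate_time_range` is in scope and that `mlrun.utils.helpers.datetime_now` returns a timezone-aware UTC timestamp:

    from datetime import datetime, timedelta, timezone

    end = datetime.now(tz=timezone.utc)
    start = end - timedelta(hours=6)

    validate_time_range(start, end)  # ok: both bounds are timezone-aware
    validate_time_range()            # ok: defaults to (now - 1 day, now)
    validate_time_range(start=datetime(2025, 1, 1))  # raises: naive datetime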
mlrun/model_monitoring/stream_processing.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import asyncio
 import datetime
 import typing
 
@@ -134,6 +134,9 @@ class EventStreamProcessor:
         the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
         using CE, the parquet target path is based on the defined MLRun artifact path.
 
+        In a separate branch, "batch complete" events are forwarded to the controller stream with an intentional delay,
+        to allow for data to first be written to parquet.
+
         :param fn: A serving function.
         :param tsdb_connector: Time series database connector.
         :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
@@ -145,6 +148,20 @@ class EventStreamProcessor:
             fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
         )
 
+        # forward batch-complete events back to the controller
+        graph.add_step(
+            "storey.Filter",
+            "FilterBatchComplete",
+            _fn="(event.get('kind') == 'batch_complete')",
+        )
+
+        graph.add_step(
+            "Delay",
+            name="BatchDelay",
+            after="FilterBatchComplete",
+            delay=self.parquet_batching_timeout_secs + 5,  # add margin
+        )
+
         # split the graph between event with error vs valid event
         graph.add_step(
             "storey.Filter",
@@ -261,7 +278,7 @@ class EventStreamProcessor:
             "controller_stream",
             path=stream_uri,
             sharding_func=ControllerEvent.ENDPOINT_ID,
-            after="ForwardNOP",
+            after=["ForwardNOP", "BatchDelay"],
             # Force using the pipeline key instead of the one in the profile in case of v3io profile.
             # In case of Kafka, this parameter will be ignored.
             alternative_v3io_access_key="V3IO_ACCESS_KEY",
@@ -309,6 +326,16 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         return event
 
 
+class Delay(mlrun.feature_store.steps.MapClass):
+    def __init__(self, delay: int, **kwargs):
+        super().__init__(**kwargs)
+        self._delay = delay
+
+    async def do(self, event):
+        await asyncio.sleep(self._delay)
+        return event
+
+
 class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
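
Note: the `Delay` step is an async pass-through that sleeps before emitting. Its core behavior, isolated from the storey/graph machinery (a self-contained sketch, not the step as wired into the flow):

    import asyncio


    class SleepThenEmit:
        """Minimal stand-in for Delay: hold an event, then pass it on."""

        def __init__(self, delay: int):
            self._delay = delay

        async def do(self, event):
            await asyncio.sleep(self._delay)
            return event


    async def main():
        step = SleepThenEmit(delay=1)
        print(await step.do({"kind": "batch_complete"}))  # emitted after ~1s


    asyncio.run(main())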
@@ -369,6 +396,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         request_id = event.get("request", {}).get("id") or event.get("resp", {}).get(
             "id"
         )
+        feature_names = event.get("request", {}).get("input_schema")
+        labels_names = event.get("resp", {}).get("output_schema")
         latency = event.get("microsec")
         features = event.get("request", {}).get("inputs")
         predictions = event.get("resp", {}).get("outputs")
@@ -469,6 +498,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 ),
                 EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
                 EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
+                EventFieldType.FEATURE_NAMES: feature_names,
+                EventFieldType.LABEL_NAMES: labels_names,
             }
         )
 
@@ -575,19 +606,19 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         self.endpoint_type = {}
 
     def _infer_feature_names_from_data(self, event):
-        for endpoint_id in self.feature_names:
-            if len(self.feature_names[endpoint_id]) >= len(
-                event[EventFieldType.FEATURES]
-            ):
-                return self.feature_names[endpoint_id]
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        if endpoint_id in self.feature_names and len(
+            self.feature_names[endpoint_id]
+        ) >= len(event[EventFieldType.FEATURES]):
+            return self.feature_names[endpoint_id]
         return None
 
     def _infer_label_columns_from_data(self, event):
-        for endpoint_id in self.label_columns:
-            if len(self.label_columns[endpoint_id]) >= len(
-                event[EventFieldType.PREDICTION]
-            ):
-                return self.label_columns[endpoint_id]
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        if endpoint_id in self.label_columns and len(
+            self.label_columns[endpoint_id]
+        ) >= len(event[EventFieldType.PREDICTION]):
+            return self.label_columns[endpoint_id]
         return None
 
     def do(self, event: dict):
@@ -632,7 +663,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 "Feature names are not initialized, they will be automatically generated",
                 endpoint_id=endpoint_id,
             )
-            feature_names = [
+            feature_names = event.get(EventFieldType.FEATURE_NAMES) or [
                 f"f{i}" for i, _ in enumerate(event[EventFieldType.FEATURES])
             ]
 
@@ -655,7 +686,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 "label column names are not initialized, they will be automatically generated",
                 endpoint_id=endpoint_id,
             )
-            label_columns = [
+            label_columns = event.get(EventFieldType.LABEL_NAMES) or [
                 f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
             ]
             attributes_to_update[EventFieldType.LABEL_NAMES] = label_columns
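
Note: both name lookups use the same `or` fallback: prefer the schema names carried on the event, else generate positional ones. The pattern in isolation (a sketch; the "label_names" and "prediction" keys stand in for the EventFieldType constants):

    def label_columns_for(event: dict) -> list[str]:
        # prefer names from the event's output schema, if present
        return event.get("label_names") or [
            f"p{i}" for i, _ in enumerate(event["prediction"])
        ]

    print(label_columns_for({"prediction": [0.2, 0.8]}))
    # -> ['p0', 'p1']
    print(label_columns_for({"prediction": [0.2, 0.8], "label_names": ["no", "yes"]}))
    # -> ['no', 'yes']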