mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/base.py +0 -31
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +123 -25
- mlrun/artifacts/manager.py +0 -5
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +10 -1
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +3 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +3 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +14 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +3 -1
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +33 -11
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/datastore.py +9 -4
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +363 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +230 -65
- mlrun/datastore/model_provider/openai_provider.py +295 -42
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +47 -19
- mlrun/db/httpdb.py +120 -56
- mlrun/db/nopdb.py +38 -10
- mlrun/execution.py +70 -19
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +15 -0
- mlrun/model.py +24 -3
- mlrun/model_monitoring/__init__.py +1 -0
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +509 -117
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +116 -33
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +100 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
- mlrun/model_monitoring/helpers.py +54 -9
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +6 -1
- mlrun/projects/pipelines.py +46 -26
- mlrun/projects/project.py +166 -58
- mlrun/run.py +94 -17
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +7 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/mpijob/abstract.py +6 -0
- mlrun/runtimes/mpijob/v1.py +6 -0
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +149 -17
- mlrun/runtimes/nuclio/function.py +76 -27
- mlrun/runtimes/nuclio/serving.py +97 -15
- mlrun/runtimes/pod.py +234 -21
- mlrun/runtimes/remotesparkjob.py +6 -0
- mlrun/runtimes/sparkjob/spark3job.py +6 -0
- mlrun/runtimes/utils.py +49 -11
- mlrun/secrets.py +54 -13
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +320 -80
- mlrun/serving/states.py +725 -157
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +200 -119
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +288 -88
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/retryer.py +15 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/writer.py
CHANGED
@@ -13,9 +13,12 @@
 # limitations under the License.

 import json
+import typing
 from datetime import datetime, timezone
 from typing import Any, Callable, NewType, Optional

+import storey
+
 import mlrun.common.model_monitoring
 import mlrun.common.schemas
 import mlrun.common.schemas.alert as alert_objects
@@ -31,6 +34,8 @@ from mlrun.common.schemas.model_monitoring.constants import (
     WriterEvent,
     WriterEventKind,
 )
+from mlrun.config import config
+from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.db._stats import (
     ModelMonitoringCurrentStatsFile,
     ModelMonitoringDriftMeasuresFile,
@@ -73,7 +78,6 @@ class ModelMonitoringWriter(StepToDict):
         self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
             project=self.project, secret_provider=secret_provider
         )
-        self._endpoints_records = {}

     def _generate_event_on_drift(
         self,
@@ -226,3 +230,218 @@ class ModelMonitoringWriter(StepToDict):
         )

         logger.info("Model monitoring writer finished handling event")
+
+
+class WriterGraphFactory:
+    def __init__(
+        self,
+        parquet_path: str,
+    ):
+        self.parquet_path = parquet_path
+        self.parquet_batching_max_events = (
+            config.model_endpoint_monitoring.writer_graph.max_events
+        )
+        self.parquet_batching_timeout_secs = (
+            config.model_endpoint_monitoring.writer_graph.parquet_batching_timeout_secs
+        )
+
+    def apply_writer_graph(
+        self,
+        fn: mlrun.runtimes.ServingRuntime,
+        tsdb_connector: TSDBConnector,
+    ):
+        graph = typing.cast(
+            mlrun.serving.states.RootFlowStep,
+            fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
+        )
+
+        graph.to("ReconstructWriterEvent", "event_reconstructor")
+        step = tsdb_connector.add_pre_writer_steps(
+            graph=graph, after="event_reconstructor"
+        )
+        before_choice = step.name if step else "event_reconstructor"
+        graph.add_step("KindChoice", "kind_choice_step", after=before_choice)
+        tsdb_connector.apply_writer_steps(
+            graph=graph,
+            after="kind_choice_step",
+        )
+        graph.add_step(
+            "AlertGenerator",
+            "alert_generator",
+            after="kind_choice_step",
+            project=fn.metadata.project,
+        )
+        graph.add_step(
+            "storey.Filter",
+            name="filter_none",
+            _fn="(event is not None)",
+            after="alert_generator",
+        )
+        graph.add_step(
+            "mlrun.serving.remote.MLRunAPIRemoteStep",
+            name="alert_generator_api_call",
+            after="filter_none",
+            method="POST",
+            path=f"projects/{fn.metadata.project}/events/{{kind}}",
+            fill_placeholders=True,
+        )
+
+        graph.add_step(
+            "mlrun.datastore.storeytargets.ParquetStoreyTarget",
+            alternative_v3io_access_key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ACCESS_KEY,
+            name="stats_writer",
+            after="kind_choice_step",
+            graph_shape="cylinder",
+            path=self.parquet_path
+            if self.parquet_path.endswith("/")
+            else self.parquet_path + "/",
+            max_events=self.parquet_batching_max_events,
+            flush_after_seconds=self.parquet_batching_timeout_secs,
+            columns=[
+                StatsData.TIMESTAMP,
+                StatsData.STATS,
+                WriterEvent.ENDPOINT_ID,
+                StatsData.STATS_NAME,
+            ],
+            partition_cols=[WriterEvent.ENDPOINT_ID, StatsData.STATS_NAME],
+            single_file=True,
+        )
+
+
+class ReconstructWriterEvent(storey.MapClass):
+    def __init__(self):
+        super().__init__()
+
+    def do(self, event: dict) -> dict[str, Any]:
+        logger.info("Reconstructing the event", event=event)
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        result_event = _AppResultEvent(json.loads(event.pop(WriterEvent.DATA, "{}")))
+        result_event.update(_AppResultEvent(event))
+
+        expected_keys = list(
+            set(WriterEvent.list()).difference(
+                [WriterEvent.EVENT_KIND, WriterEvent.DATA]
+            )
+        )
+        if kind == WriterEventKind.METRIC:
+            expected_keys.extend(MetricData.list())
+        elif kind == WriterEventKind.RESULT:
+            expected_keys.extend(ResultData.list())
+        elif kind == WriterEventKind.STATS:
+            expected_keys.extend(StatsData.list())
+        else:
+            raise _WriterEventValueError(
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        missing_keys = [key for key in expected_keys if key not in result_event]
+        if missing_keys:
+            raise _WriterEventValueError(
+                f"The received event misses some keys compared to the expected "
+                f"monitoring application event schema: {missing_keys} for event kind {kind}"
+            )
+        result_event["kind"] = kind
+        if kind in WriterEventKind.user_app_outputs():
+            result_event[WriterEvent.END_INFER_TIME] = datetime.fromisoformat(
+                event[WriterEvent.END_INFER_TIME]
+            )
+        if kind == WriterEventKind.STATS:
+            result_event[StatsData.STATS] = json.dumps(result_event[StatsData.STATS])
+        return result_event
+
+
+class KindChoice(storey.Choice):
+    def select_outlets(self, event):
+        kind = event.get("kind")
+        logger.info("Selecting the outlet for the event", kind=kind)
+        if kind == WriterEventKind.METRIC:
+            outlets = ["tsdb_metrics"]
+        elif kind == WriterEventKind.RESULT:
+            outlets = ["tsdb_app_results", "alert_generator"]
+        elif kind == WriterEventKind.STATS:
+            outlets = ["stats_writer"]
+        else:
+            raise _WriterEventValueError(
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        return outlets
+
+
+class AlertGenerator(storey.MapClass):
+    def __init__(self, project: str, **kwargs):
+        self.project = project
+        super().__init__(**kwargs)
+
+    def do(self, event: dict) -> Optional[dict[str, Any]]:
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        if (
+            mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
+            and kind == WriterEventKind.RESULT
+            and (
+                event[ResultData.RESULT_STATUS] == ResultStatusApp.detected.value
+                or event[ResultData.RESULT_STATUS]
+                == ResultStatusApp.potential_detection.value
+            )
+        ):
+            event_value = {
+                "app_name": event[WriterEvent.APPLICATION_NAME],
+                "model": event[WriterEvent.ENDPOINT_NAME],
+                "model_endpoint_id": event[WriterEvent.ENDPOINT_ID],
+                "result_name": event[ResultData.RESULT_NAME],
+                "result_value": event[ResultData.RESULT_VALUE],
+            }
+            data = self._generate_event_data(
+                entity_id=get_result_instance_fqn(
+                    event[WriterEvent.ENDPOINT_ID],
+                    event[WriterEvent.APPLICATION_NAME],
+                    event[ResultData.RESULT_NAME],
+                ),
+                result_status=event[ResultData.RESULT_STATUS],
+                event_value=event_value,
+                project_name=self.project,
+                result_kind=event[ResultData.RESULT_KIND],
+            )
+            event = data.dict()
+            logger.info("Generated alert event", event=event)
+            return event
+        return None
+
+    @staticmethod
+    def _generate_alert_event_kind(
+        result_kind: int, result_status: int
+    ) -> alert_objects.EventKind:
+        """Generate the required Event Kind format for the alerting system"""
+        event_kind = ResultKindApp(value=result_kind).name
+
+        if result_status == ResultStatusApp.detected.value:
+            event_kind = f"{event_kind}_detected"
+        else:
+            event_kind = f"{event_kind}_suspected"
+        return alert_objects.EventKind(
+            value=mlrun.utils.helpers.normalize_name(event_kind)
+        )
+
+    def _generate_event_data(
+        self,
+        entity_id: str,
+        result_status: int,
+        event_value: dict,
+        project_name: str,
+        result_kind: int,
+    ) -> mlrun.common.schemas.Event:
+        entity = mlrun.common.schemas.alert.EventEntities(
+            kind=alert_objects.EventEntityKind.MODEL_ENDPOINT_RESULT,
+            project=project_name,
+            ids=[entity_id],
+        )
+
+        event_kind = self._generate_alert_event_kind(
+            result_status=result_status, result_kind=result_kind
+        )
+
+        event_data = mlrun.common.schemas.Event(
+            kind=alert_objects.EventKind(value=event_kind),
+            entity=entity,
+            value_dict=event_value,
+        )
+
+        return event_data
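The new classes above split the writer into a storey graph: ReconstructWriterEvent normalizes raw events, KindChoice fans them out by kind, and AlertGenerator turns detected results into alert events. Below is a minimal self-contained sketch of the routing contract that KindChoice.select_outlets implements (plain string kinds stand in for the WriterEventKind enum members; this is an illustration, not the mlrun implementation):

# Sketch of the kind-based fan-out used by the writer graph. The outlet
# names mirror the steps wired in apply_writer_graph above; the string
# kinds are placeholders for WriterEventKind members.
ROUTING: dict[str, list[str]] = {
    "metric": ["tsdb_metrics"],                         # metrics go straight to the TSDB
    "result": ["tsdb_app_results", "alert_generator"],  # results also feed alerting
    "stats": ["stats_writer"],                          # stats land in the parquet target
}

def select_outlets(event: dict) -> list[str]:
    kind = event.get("kind")
    if kind not in ROUTING:
        raise ValueError(f"Unknown event kind: {kind!r}, expected one of: {list(ROUTING)}")
    return ROUTING[kind]

# A drift result fans out to both the TSDB and the alerting branch:
assert select_outlets({"kind": "result"}) == ["tsdb_app_results", "alert_generator"]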
mlrun/platforms/__init__.py
CHANGED
@@ -25,6 +25,7 @@ from .iguazio import (
 )


+# TODO: Remove in 1.11.0
 class _DeprecationHelper:
     """A helper class to deprecate old schemas"""

@@ -48,12 +49,12 @@ class _DeprecationHelper:
     def _warn(self):
         warnings.warn(
             f"mlrun.platforms.{self._new_target} is deprecated since version {self._version}, "
-            f"and will be removed in 1.
+            f"and will be removed in 1.11.0. Use mlrun.runtimes.mounts.{self._new_target} instead.",
             FutureWarning,
         )


-# TODO: Remove in 1.
+# TODO: Remove in 1.11.0
 # For backwards compatibility
 VolumeMount = _DeprecationHelper("VolumeMount")
 auto_mount = _DeprecationHelper("auto_mount")
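For context, the deprecation shim warns on use and forwards to the symbol's new home under mlrun.runtimes.mounts. A rough self-contained sketch of that pattern (DeprecatedAlias is illustrative, not mlrun's actual _DeprecationHelper internals; the default version string is an assumption):

import warnings

class DeprecatedAlias:
    """Illustrative stand-in for a lazy deprecation shim: warn on use,
    then forward to the symbol's relocated implementation."""

    def __init__(self, target: str, version: str = "1.10.0"):
        self._target = target
        self._version = version

    def __call__(self, *args, **kwargs):
        warnings.warn(
            f"mlrun.platforms.{self._target} is deprecated since version "
            f"{self._version}, and will be removed in 1.11.0. "
            f"Use mlrun.runtimes.mounts.{self._target} instead.",
            FutureWarning,
        )
        import mlrun.runtimes.mounts  # resolved lazily to avoid import cycles
        return getattr(mlrun.runtimes.mounts, self._target)(*args, **kwargs)

# Calling the alias emits a FutureWarning and delegates to the new location:
auto_mount = DeprecatedAlias("auto_mount")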
mlrun/platforms/iguazio.py
CHANGED
@@ -96,7 +96,11 @@ class OutputStream:
         if access_key:
             v3io_client_kwargs["access_key"] = access_key

-
+        if not mock:
+            self._v3io_client = v3io.dataplane.Client(**v3io_client_kwargs)
+        else:
+            self._v3io_client = None
+
         self._container, self._stream_path = split_path(stream_path)
         self._shards = shards
         self._retention_in_hours = retention_in_hours
@@ -105,7 +109,7 @@ class OutputStream:
         self._mock = mock
         self._mock_queue = []

-    def create_stream(self):
+    def create_stream(self) -> None:
         # this import creates an import loop via the utils module, so putting it in execution path
         from mlrun.utils.helpers import logger

@@ -210,7 +214,7 @@ class KafkaOutputStream:
         self._initialized = False

     def _lazy_init(self):
-        if self._initialized:
+        if self._initialized or self._mock:
             return

         import kafka
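Both stream classes now short-circuit when mocking: OutputStream skips building a v3io client, and KafkaOutputStream never lazily initializes a Kafka producer. A small self-contained sketch of the same guard pattern (the class and names are illustrative, not mlrun's; kafka-python is used for the real-client branch):

class LazyKafkaSink:
    """Illustrative mock guard: skip expensive client setup when mocking."""

    def __init__(self, brokers: list[str], mock: bool = False):
        self._brokers = brokers
        self._mock = mock
        self._initialized = False
        self._producer = None
        self._mock_queue: list[bytes] = []

    def _lazy_init(self) -> None:
        # Mirrors the diff: return early when already initialized OR mocking,
        # so tests never open a real Kafka connection.
        if self._initialized or self._mock:
            return
        import kafka  # real client is only built on first genuine use

        self._producer = kafka.KafkaProducer(bootstrap_servers=self._brokers)
        self._initialized = True

    def push(self, record: bytes) -> None:
        self._lazy_init()
        if self._mock:
            self._mock_queue.append(record)  # collected in-memory instead of sent
        else:
            self._producer.send("events", record)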
mlrun/projects/operations.py
CHANGED
@@ -177,7 +177,12 @@ def run_function(
                              This ensures latest code changes are executed. This argument must be used in
                              conjunction with the local=True argument.
     :param output_path: path to store artifacts, when running in a workflow this will be set automatically
-    :param retry: Retry configuration for the run, can be a dict or an instance of
+    :param retry: Retry configuration for the run, can be a dict or an instance of
+                  :py:class:`~mlrun.model.Retry`.
+                  The `count` field in the `Retry` object specifies the number of retry attempts.
+                  If `count=0`, the run will not be retried.
+                  The `backoff` field specifies the retry backoff strategy between retry attempts.
+                  If not provided, the default backoff delay is 30 seconds.
     :return: MLRun RunObject or PipelineNodeWrapper
     """
     if artifact_path:
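The expanded docstring suggests usage along these lines. A hedged sketch (the Retry constructor keywords beyond count and backoff are assumptions; only count and backoff are documented in this diff):

import mlrun
from mlrun.model import Retry

project = mlrun.get_or_create_project("demo", context="./")

# Retry object: up to 3 attempts; with no explicit backoff, the delay
# between attempts defaults to 30 seconds per the docstring above.
run = project.run_function("trainer", retry=Retry(count=3))

# Equivalent dict form; count=0 disables retries entirely.
run = project.run_function("trainer", retry={"count": 0})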
mlrun/projects/pipelines.py
CHANGED
@@ -228,11 +228,11 @@ class _PipelineContext:
         force_run_local = mlrun.mlconf.force_run_local
         if force_run_local is None or force_run_local == "auto":
             force_run_local = not mlrun.mlconf.is_api_running_on_k8s()
+
+        if self.workflow:
             if not mlrun.mlconf.kfp_url:
                 logger.debug("Kubeflow pipeline URL is not set, running locally")
                 force_run_local = True
-
-        if self.workflow:
             force_run_local = force_run_local or self.workflow.run_local

         return force_run_local
@@ -1072,7 +1072,11 @@ def github_webhook(request):


 def rerun_workflow(
-    context: mlrun.execution.MLClientCtx,
+    context: mlrun.execution.MLClientCtx,
+    run_uid: str,
+    project_name: str,
+    original_runner_uid: str,
+    original_workflow_name: str,
 ):
     """
     Re-run a workflow by retrying a previously failed KFP pipeline.
@@ -1080,8 +1084,11 @@ def rerun_workflow(
     :param context: MLRun context.
     :param run_uid: The run UID of the original workflow to retry.
     :param project_name: The project name.
+    :param original_runner_uid: The original workflow runner UID.
+    :param original_workflow_name: The original workflow name.
     """
     db = mlrun.get_run_db()
+    new_pipeline_id = None

     try:
         # Invoke the KFP retry endpoint (direct-submit mode)
@@ -1096,6 +1103,24 @@ def rerun_workflow(
             rerun_of_workflow=run_uid,
         )

+        # Enqueue "running" notifications server-side for this RerunRunner run
+        db.push_run_notifications(context.uid, project_name)
+
+        context.set_label(
+            mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id
+        )
+        context.update_run()
+
+        context.log_result("workflow_id", new_pipeline_id)
+
+        pipeline = wait_for_pipeline_completion(
+            new_pipeline_id,
+            project=project_name,
+        )
+
+        final_state = pipeline["run"]["status"]
+        context.log_result("workflow_state", final_state, commit=True)
+
     except mlrun.errors.MLRunHTTPError as http_exc:
         logger.error(
             "Failed calling KFP retry API",
@@ -1104,33 +1129,28 @@ def rerun_workflow(
         )
         raise

-    # Enqueue "running" notifications server-side for this RerunRunner run
-    db.push_run_notifications(context.uid, project_name)
-
-    context.set_label(mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id)
-    context.update_run()
-
-    context.log_result("workflow_id", new_pipeline_id)
-
-    try:
-        pipeline = wait_for_pipeline_completion(
-            new_pipeline_id,
-            project=project_name,
-        )
     except Exception as exc:
-
-        "
+        logger.error(
+            "Error during rerun_workflow execution",
+            error=err_to_str(exc),
             rerun_pipeline_id=new_pipeline_id,
-            exc=err_to_str(exc),
         )
-
-    final_state = pipeline["run"]["status"]
-    context.log_result("workflow_state", final_state, commit=True)
+        raise

-
-
-
-
+    finally:
+        # Once the rerun has finished, clear the "retrying" label on the original runner
+        # so that subsequent retry requests can acquire the lock again.
+        db.set_run_retrying_status(
+            project=project_name,
+            name=original_workflow_name,
+            run_id=original_runner_uid,
+            retrying=False,
+        )
+
+        if final_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Pipeline retry of {run_uid} finished in state={final_state}"
+            )


 def load_and_run(context, *args, **kwargs):
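Pulled out of context, the restructured rerun flow is easier to follow. A self-contained sketch (the three helpers are stubs, not mlrun APIs; unlike the diff, this sketch checks the final state after the finally block rather than inside it, and pre-initializes final_state so the check is always defined):

def submit_kfp_retry(run_uid: str) -> str:
    return f"{run_uid}-retry"  # stub: the real code calls the KFP retry endpoint

def wait_for_completion(pipeline_id: str) -> str:
    return "Succeeded"  # stub: the real code polls the pipeline until it finishes

def release_retry_lock(run_uid: str) -> None:
    print(f"cleared 'retrying' label for {run_uid}")  # stub for set_run_retrying_status

def rerun(run_uid: str) -> None:
    new_pipeline_id = None  # pre-initialized so except/finally can reference it
    final_state = None
    try:
        new_pipeline_id = submit_kfp_retry(run_uid)
        final_state = wait_for_completion(new_pipeline_id)
    except Exception as exc:
        print(f"Error during rerun execution: {exc} (pipeline={new_pipeline_id})")
        raise
    finally:
        # Always release the lock so later retry requests can acquire it,
        # whether the rerun succeeded, failed, or never got a pipeline id.
        release_retry_lock(run_uid)
    if final_state != "Succeeded":
        raise RuntimeError(f"Pipeline retry of {run_uid} finished in state={final_state}")

rerun("abc123")  # prints the lock release and returns on success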
|