mlrun 1.10.0rc16__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +9 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +34 -21
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +265 -7
- mlrun/datastore/datastore.py +10 -5
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +367 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +211 -74
- mlrun/datastore/model_provider/openai_provider.py +243 -71
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +27 -19
- mlrun/db/httpdb.py +57 -48
- mlrun/db/nopdb.py +25 -10
- mlrun/execution.py +55 -13
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +2 -0
- mlrun/model.py +9 -3
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +388 -138
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +36 -13
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
- mlrun/model_monitoring/helpers.py +28 -5
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +16 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +157 -69
- mlrun/run.py +97 -20
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +147 -17
- mlrun/runtimes/nuclio/function.py +72 -27
- mlrun/runtimes/nuclio/serving.py +102 -20
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +54 -13
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +230 -40
- mlrun/serving/states.py +605 -232
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +136 -81
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +215 -83
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/mail.py +38 -15
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +51 -50
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +100 -95
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
mlrun/serving/server.py
CHANGED
@@ -17,21 +17,26 @@ __all__ = ["GraphServer", "create_graph_server", "GraphContext", "MockEvent"]
 import asyncio
 import base64
 import copy
+import importlib
 import json
 import os
 import socket
 import traceback
 import uuid
+from collections import defaultdict
+from datetime import datetime, timezone
 from typing import Any, Optional, Union
 
+import pandas as pd
 import storey
 from nuclio import Context as NuclioContext
 from nuclio.request import Logger as NuclioLogger
 
 import mlrun
-import mlrun.common.constants
 import mlrun.common.helpers
 import mlrun.common.schemas
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
+import mlrun.datastore.datastore_profile as ds_profile
 import mlrun.model_monitoring
 import mlrun.utils
 from mlrun.config import config
@@ -40,12 +45,13 @@ from mlrun.secrets import SecretsStore
 
 from ..common.helpers import parse_versioned_object_uri
 from ..common.schemas.model_monitoring.constants import FileTargetKind
+from ..common.schemas.serving import MAX_BATCH_JOB_DURATION
 from ..datastore import DataItem, get_stream_pusher
 from ..datastore.store_resources import ResourceCache
 from ..errors import MLRunInvalidArgumentError
 from ..execution import MLClientCtx
 from ..model import ModelObj
-from ..utils import get_caller_globals
+from ..utils import get_caller_globals, get_relative_module_name_from_path
 from .states import (
     FlowStep,
     MonitoredStep,
@@ -77,7 +83,6 @@ class _StreamContext:
         self.hostname = socket.gethostname()
         self.function_uri = function_uri
         self.output_stream = None
-        stream_uri = None
         log_stream = parameters.get(FileTargetKind.LOG_STREAM, "")
 
         if (enabled or log_stream) and function_uri:
@@ -88,20 +93,16 @@ class _StreamContext:
 
             stream_args = parameters.get("stream_args", {})
 
-            if log_stream == DUMMY_STREAM:
-                # Dummy stream used for testing, see tests/serving/test_serving.py
-                stream_uri = DUMMY_STREAM
-            elif not stream_args.get("mock"):  # if not a mock: `context.is_mock = True`
-                stream_uri = mlrun.model_monitoring.get_stream_path(project=project)
-
             if log_stream:
-                #
-
-                self.output_stream = get_stream_pusher(
+                # Get the output stream from the log stream path
+                stream_path = log_stream.format(project=project)
+                self.output_stream = get_stream_pusher(stream_path, **stream_args)
             else:
                 # Get the output stream from the profile
                 self.output_stream = mlrun.model_monitoring.helpers.get_output_stream(
-                    project=project,
+                    project=project,
+                    profile=parameters.get("stream_profile"),
+                    mock=stream_args.get("mock", False),
                 )
 
 
@@ -179,11 +180,12 @@ class GraphServer(ModelObj):
         self,
         context,
         namespace,
-        resource_cache: ResourceCache = None,
+        resource_cache: Optional[ResourceCache] = None,
         logger=None,
         is_mock=False,
         monitoring_mock=False,
-
+        stream_profile: Optional[ds_profile.DatastoreProfile] = None,
+    ) -> None:
         """for internal use, initialize all steps (recursively)"""
 
         if self.secret_sources:
@@ -198,6 +200,20 @@ class GraphServer(ModelObj):
         context.monitoring_mock = monitoring_mock
         context.root = self.graph
 
+        if is_mock and monitoring_mock:
+            if stream_profile:
+                # Add the user-defined stream profile to the parameters
+                self.parameters["stream_profile"] = stream_profile
+            elif not (
+                self.parameters.get(FileTargetKind.LOG_STREAM)
+                or mlrun.get_secret_or_env(
+                    mm_constants.ProjectSecretKeys.STREAM_PROFILE_NAME
+                )
+            ):
+                # Set a dummy log stream for mocking purposes if there is no direct
+                # user-defined stream profile and no information in the environment
+                self.parameters[FileTargetKind.LOG_STREAM] = DUMMY_STREAM
+
         context.stream = _StreamContext(
             self.track_models, self.parameters, self.function_uri
         )
@@ -358,6 +374,7 @@ def add_error_raiser_step(
         raise_exception=monitored_step.raise_exception,
         models_names=list(monitored_step.class_args["models"].keys()),
         model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
+        function=monitored_step.function,
     )
     if monitored_step.responder:
         monitored_step.responder = False
@@ -400,6 +417,7 @@ def add_monitoring_general_steps(
         "mlrun.serving.system_steps.BackgroundTaskStatus",
         "background_task_status_step",
         model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
+        full_event=True,
    )
     monitor_flow_step = graph.add_step(
         "storey.Filter",
@@ -505,10 +523,6 @@ def add_system_steps_to_graph(
         monitor_flow_step.after = [
             step_name,
         ]
-    context.logger.info_with(
-        "Server graph after adding system steps",
-        graph=str(graph.steps),
-    )
     return graph
 
 
@@ -561,25 +575,51 @@ def v2_serving_init(context, namespace=None):
 async def async_execute_graph(
     context: MLClientCtx,
     data: DataItem,
+    timestamp_column: Optional[str],
     batching: bool,
     batch_size: Optional[int],
     read_as_lists: bool,
     nest_under_inputs: bool,
-) ->
+) -> None:
+    # Validate that data parameter is a DataItem and not passed via params
+    if not isinstance(data, DataItem):
+        raise MLRunInvalidArgumentError(
+            f"Parameter 'data' has type hint 'DataItem' but got {type(data).__name__} instead. "
+            f"Data files and artifacts must be passed via the 'inputs' parameter, not 'params'. "
+            f"The 'params' parameter is for simple configuration values (strings, numbers, booleans), "
+            f"while 'inputs' is for data files that need to be loaded. "
+            f"Example: run_function(..., inputs={{'data': 'path/to/data.csv'}}, params={{other_config: value}})"
+        )
+    run_call_count = 0
     spec = mlrun.utils.get_serving_spec()
-
-    namespace = {}
+    modname = None
     code = os.getenv("MLRUN_EXEC_CODE")
     if code:
         code = base64.b64decode(code).decode("utf-8")
-
+        with open("user_code.py", "w") as fp:
+            fp.write(code)
+        modname = "user_code"
     else:
         # TODO: find another way to get the local file path, or ensure that MLRUN_EXEC_CODE
         # gets set in local flow and not just in the remote pod
-
-        if
-
-
+        source_file_path = spec.get("filename", None)
+        if source_file_path:
+            source_file_path_object, working_dir_path_object = (
+                mlrun.utils.helpers.get_source_and_working_dir_paths(source_file_path)
+            )
+            if not source_file_path_object.is_relative_to(working_dir_path_object):
+                raise mlrun.errors.MLRunRuntimeError(
+                    f"Source file path '{source_file_path}' is not under the current working directory "
+                    f"(which is required when running with local=True)"
+                )
+            modname = get_relative_module_name_from_path(
+                source_file_path_object, working_dir_path_object
+            )
+
+    namespace = {}
+    if modname:
+        mod = importlib.import_module(modname)
+        namespace = mod.__dict__
 
     server = GraphServer.from_dict(spec)
 
@@ -605,10 +645,43 @@ async def async_execute_graph(
             f"(status='{task_state}')"
         )
 
+    df = data.as_df()
+
+    if df.empty:
+        context.logger.warn("Job terminated due to empty inputs (0 rows)")
+        return
+
+    track_models = spec.get("track_models")
+
+    if track_models and timestamp_column:
+        context.logger.info(f"Sorting dataframe by {timestamp_column}")
+        df[timestamp_column] = pd.to_datetime(  # in case it's a string
+            df[timestamp_column]
+        )
+        df.sort_values(by=timestamp_column, inplace=True)
+        if len(df) > 1:
+            start_time = df[timestamp_column].iloc[0]
+            end_time = df[timestamp_column].iloc[-1]
+            time_range = end_time - start_time
+            start_time = start_time.isoformat()
+            end_time = end_time.isoformat()
+            # TODO: tie this to the controller's base period
+            if time_range > pd.Timedelta(MAX_BATCH_JOB_DURATION):
+                raise mlrun.errors.MLRunRuntimeError(
+                    f"Dataframe time range is too long: {time_range}. "
+                    "Please disable tracking or reduce the input dataset's time range below the defined limit "
+                    f"of {MAX_BATCH_JOB_DURATION}."
+                )
+        else:
+            start_time = end_time = df["timestamp"].iloc[0].isoformat()
+    else:
+        # end time will be set from clock time when the batch completes
+        start_time = datetime.now(tz=timezone.utc).isoformat()
+
     server.graph = add_system_steps_to_graph(
         server.project,
         copy.deepcopy(server.graph),
-
+        track_models,
         context,
         spec,
         pause_until_background_task_completion=False,  # we've already awaited it
@@ -616,7 +689,6 @@ async def async_execute_graph(
 
     if config.log_level.lower() == "debug":
         server.verbose = True
-    context.logger.info_with("Initializing states", namespace=namespace)
     kwargs = {}
     if hasattr(context, "is_mock"):
         kwargs["is_mock"] = context.is_mock
@@ -633,19 +705,30 @@ async def async_execute_graph(
     if server.verbose:
         context.logger.info(server.to_yaml())
 
-    df = data.as_df()
-
-    responses = []
-
     async def run(body):
+        nonlocal run_call_count
         event = storey.Event(id=index, body=body)
-
-
+        if timestamp_column:
+            if batching:
+                # we use the first row in the batch to determine the timestamp for the whole batch
+                body = body[0]
+            if not isinstance(body, dict):
+                raise mlrun.errors.MLRunRuntimeError(
+                    f"When timestamp_column=True, event body must be a dict – got {type(body).__name__} instead"
+                )
+            if timestamp_column not in body:
+                raise mlrun.errors.MLRunRuntimeError(
+                    f"Event body '{body}' did not contain timestamp column '{timestamp_column}'"
+                )
+            event._original_timestamp = body[timestamp_column]
+        run_call_count += 1
+        return await server.run(event, context)
 
     if batching and not batch_size:
         batch_size = len(df)
 
     batch = []
+    tasks = []
     for index, row in df.iterrows():
         data = row.to_list() if read_as_lists else row.to_dict()
         if nest_under_inputs:
@@ -653,24 +736,119 @@ async def async_execute_graph(
         if batching:
             batch.append(data)
             if len(batch) == batch_size:
-
+                tasks.append(asyncio.create_task(run(batch)))
                 batch = []
         else:
-
+            tasks.append(asyncio.create_task(run(data)))
 
     if batch:
-
+        tasks.append(asyncio.create_task(run(batch)))
+
+    responses = await asyncio.gather(*tasks)
 
     termination_result = server.wait_for_completion()
     if asyncio.iscoroutine(termination_result):
         await termination_result
 
-
+    model_endpoint_uids = spec.get("model_endpoint_uids", [])
+
+    # needed for output_stream to be created
+    server = GraphServer.from_dict(spec)
+    server.init_states(None, namespace)
+
+    batch_completion_time = datetime.now(tz=timezone.utc).isoformat()
+
+    if not timestamp_column:
+        end_time = batch_completion_time
+
+    mm_stream_record = dict(
+        kind="batch_complete",
+        project=context.project,
+        first_timestamp=start_time,
+        last_timestamp=end_time,
+        batch_completion_time=batch_completion_time,
+    )
+    output_stream = server.context.stream.output_stream
+    for mep_uid in spec.get("model_endpoint_uids", []):
+        mm_stream_record["endpoint_id"] = mep_uid
+        output_stream.push(mm_stream_record, partition_key=mep_uid)
+
+    context.logger.info(
+        f"Job completed processing {len(df)} rows",
+        timestamp_column=timestamp_column,
+        model_endpoint_uids=model_endpoint_uids,
+    )
+
+    has_responder = False
+    for step in server.graph.steps.values():
+        if getattr(step, "responder", False):
+            has_responder = True
+            break
+
+    if has_responder:
+        # log the results as a dataset artifact
+        artifact_path = None
+        if (
+            "{{run.uid}}" not in context.artifact_path
+        ):  # TODO: delete when IG-22841 is resolved
+            artifact_path = "+/{{run.uid}}"  # will be concatenated to the context's path in extend_artifact_path
+        context.log_dataset(
+            "prediction", df=pd.DataFrame(responses), artifact_path=artifact_path
+        )
+
+        # if we got responses that appear to be in the right format, try to log per-model datasets too
+        if (
+            responses
+            and responses[0]
+            and isinstance(responses[0], dict)
+            and isinstance(next(iter(responses[0].values())), (dict, list))
+        ):
+            try:
+                # turn this list of samples into a dict of lists, one per model endpoint
+                grouped = defaultdict(list)
+                for sample in responses:
+                    for model_name, features in sample.items():
+                        grouped[model_name].append(features)
+                # create a dataframe per model endpoint and log it
+                for model_name, features in grouped.items():
+                    context.log_dataset(
+                        f"prediction_{model_name}",
+                        df=pd.DataFrame(features),
+                        artifact_path=artifact_path,
+                    )
+            except Exception as e:
+                context.logger.warning(
+                    "Failed to log per-model prediction datasets",
+                    error=err_to_str(e),
+                )
+
+    context.log_result("num_rows", run_call_count)
+
+
+def _is_inside_asyncio_loop():
+    try:
+        asyncio.get_running_loop()
+        return True
+    except RuntimeError:
+        return False
+
+
+# Workaround for running with local=True in Jupyter (ML-10620)
+def _workaround_asyncio_nesting():
+    try:
+        import nest_asyncio
+    except ImportError:
+        raise mlrun.errors.MLRunRuntimeError(
+            "Cannot execute graph from within an already running asyncio loop. "
+            "Attempt to import nest_asyncio as a workaround failed as well."
+        )
+    nest_asyncio.apply()
 
 
 def execute_graph(
     context: MLClientCtx,
     data: DataItem,
+    timestamp_column: Optional[str] = None,
     batching: bool = False,
     batch_size: Optional[int] = None,
     read_as_lists: bool = False,
@@ -681,6 +859,9 @@ def execute_graph(
 
     :param context: The job's execution client context.
    :param data: The input data to the job, to be pushed into the graph row by row, or in batches.
+    :param timestamp_column: The name of the column that will be used as the timestamp for model monitoring purposes.
+        when timestamp_column is used in conjunction with batching, the first timestamp will be used for the entire
+        batch.
     :param batching: Whether to push one or more batches into the graph rather than row by row.
     :param batch_size: The number of rows to push per batch. If not set, and batching=True, the entire dataset will
         be pushed into the graph in one batch.
@@ -689,9 +870,18 @@ def execute_graph(
 
     :return: A list of responses.
     """
+    if _is_inside_asyncio_loop():
+        _workaround_asyncio_nesting()
+
     return asyncio.run(
         async_execute_graph(
-            context,
+            context,
+            data,
+            timestamp_column,
+            batching,
+            batch_size,
+            read_as_lists,
+            nest_under_inputs,
        )
     )
 
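
Usage note (not part of the package diff): the new timestamp_column, batching, and batch_size arguments of execute_graph travel through the job's params, while the dataset itself must be passed through inputs so the handler receives it as a DataItem (the added validation in async_execute_graph raises otherwise). Below is a minimal sketch of that calling convention under stated assumptions: the project name, function name, and CSV path are placeholders, and the wiring that exposes execute_graph as the job handler is not shown in this diff.

import mlrun

# Hypothetical project and function names; the function is assumed to expose
# mlrun.serving.server.execute_graph as its handler and to carry the serving graph spec.
project = mlrun.get_or_create_project("graph-batch-demo", context="./")

run = project.run_function(
    "batch-graph",
    handler="execute_graph",
    # data files go through `inputs` so they arrive as a DataItem
    inputs={"data": "path/to/data.csv"},
    # simple configuration values go through `params`
    params={
        "timestamp_column": "timestamp",  # column used to order rows for model monitoring
        "batching": True,  # push rows into the graph in batches instead of one by one
        "batch_size": 1000,
    },
)

When batching and timestamp_column are combined, the first row of each batch supplies the timestamp for the whole batch, and, when model tracking is enabled, the run is rejected if the dataset's time range exceeds MAX_BATCH_JOB_DURATION.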