mlrun 1.6.2rc5__py3-none-any.whl → 1.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; see the registry's advisory page for more details.

mlrun/artifacts/model.py CHANGED
@@ -13,8 +13,9 @@
13
13
  # limitations under the License.
14
14
  import tempfile
15
15
  from os import path
16
- from typing import List
16
+ from typing import Any
17
17
 
18
+ import pandas as pd
18
19
  import yaml
19
20
  from deprecated import deprecated
20
21
 
@@ -68,8 +69,8 @@ class ModelArtifactSpec(ArtifactSpec):
68
69
  model_file=None,
69
70
  metrics=None,
70
71
  paraemeters=None,
71
- inputs: List[Feature] = None,
72
- outputs: List[Feature] = None,
72
+ inputs: list[Feature] = None,
73
+ outputs: list[Feature] = None,
73
74
  framework=None,
74
75
  algorithm=None,
75
76
  feature_vector=None,
@@ -91,8 +92,8 @@ class ModelArtifactSpec(ArtifactSpec):
91
92
  self.model_file = model_file
92
93
  self.metrics = metrics or {}
93
94
  self.parameters = paraemeters or {}
94
- self.inputs: List[Feature] = inputs or []
95
- self.outputs: List[Feature] = outputs or []
95
+ self.inputs: list[Feature] = inputs or []
96
+ self.outputs: list[Feature] = outputs or []
96
97
  self.framework = framework
97
98
  self.algorithm = algorithm
98
99
  self.feature_vector = feature_vector
@@ -101,21 +102,21 @@ class ModelArtifactSpec(ArtifactSpec):
101
102
  self.model_target_file = model_target_file
102
103
 
103
104
  @property
104
- def inputs(self) -> List[Feature]:
105
+ def inputs(self) -> list[Feature]:
105
106
  """input feature list"""
106
107
  return self._inputs
107
108
 
108
109
  @inputs.setter
109
- def inputs(self, inputs: List[Feature]):
110
+ def inputs(self, inputs: list[Feature]):
110
111
  self._inputs = ObjectList.from_list(Feature, inputs)
111
112
 
112
113
  @property
113
- def outputs(self) -> List[Feature]:
114
+ def outputs(self) -> list[Feature]:
114
115
  """output feature list"""
115
116
  return self._outputs
116
117
 
117
118
  @outputs.setter
118
- def outputs(self, outputs: List[Feature]):
119
+ def outputs(self, outputs: list[Feature]):
119
120
  self._outputs = ObjectList.from_list(Feature, outputs)
120
121
 
121
122
 
@@ -175,22 +176,22 @@ class ModelArtifact(Artifact):
175
176
  self._spec = self._verify_dict(spec, "spec", ModelArtifactSpec)
176
177
 
177
178
  @property
178
- def inputs(self) -> List[Feature]:
179
+ def inputs(self) -> list[Feature]:
179
180
  """input feature list"""
180
181
  return self.spec.inputs
181
182
 
182
183
  @inputs.setter
183
- def inputs(self, inputs: List[Feature]):
184
+ def inputs(self, inputs: list[Feature]):
184
185
  """input feature list"""
185
186
  self.spec.inputs = inputs
186
187
 
187
188
  @property
188
- def outputs(self) -> List[Feature]:
189
+ def outputs(self) -> list[Feature]:
189
190
  """input feature list"""
190
191
  return self.spec.outputs
191
192
 
192
193
  @outputs.setter
193
- def outputs(self, outputs: List[Feature]):
194
+ def outputs(self, outputs: list[Feature]):
194
195
  """input feature list"""
195
196
  self.spec.outputs = outputs
196
197
 
@@ -260,6 +261,7 @@ class ModelArtifact(Artifact):
260
261
  """
261
262
  subset = df
262
263
  inferer = get_infer_interface(subset)
264
+ numeric_columns = self._extract_numeric_features(df)
263
265
  if label_columns:
264
266
  if not isinstance(label_columns, list):
265
267
  label_columns = [label_columns]
@@ -273,9 +275,13 @@ class ModelArtifact(Artifact):
273
275
  )
274
276
  if with_stats:
275
277
  self.spec.feature_stats = inferer.get_stats(
276
- df, options=InferOptions.Histogram, num_bins=num_bins
278
+ df[numeric_columns], options=InferOptions.Histogram, num_bins=num_bins
277
279
  )
278
280
 
281
+ @staticmethod
282
+ def _extract_numeric_features(df: pd.DataFrame) -> list[Any]:
283
+ return [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
284
+
279
285
  @property
280
286
  def is_dir(self):
281
287
  return True
@@ -445,8 +451,8 @@ class LegacyModelArtifact(LegacyArtifact):
445
451
  self.model_file = model_file
446
452
  self.parameters = parameters or {}
447
453
  self.metrics = metrics or {}
448
- self.inputs: List[Feature] = inputs or []
449
- self.outputs: List[Feature] = outputs or []
454
+ self.inputs: list[Feature] = inputs or []
455
+ self.outputs: list[Feature] = outputs or []
450
456
  self.extra_data = extra_data or {}
451
457
  self.framework = framework
452
458
  self.algorithm = algorithm
@@ -456,21 +462,21 @@ class LegacyModelArtifact(LegacyArtifact):
456
462
  self.model_target_file = model_target_file
457
463
 
458
464
  @property
459
- def inputs(self) -> List[Feature]:
465
+ def inputs(self) -> list[Feature]:
460
466
  """input feature list"""
461
467
  return self._inputs
462
468
 
463
469
  @inputs.setter
464
- def inputs(self, inputs: List[Feature]):
470
+ def inputs(self, inputs: list[Feature]):
465
471
  self._inputs = ObjectList.from_list(Feature, inputs)
466
472
 
467
473
  @property
468
- def outputs(self) -> List[Feature]:
474
+ def outputs(self) -> list[Feature]:
469
475
  """output feature list"""
470
476
  return self._outputs
471
477
 
472
478
  @outputs.setter
473
- def outputs(self, outputs: List[Feature]):
479
+ def outputs(self, outputs: list[Feature]):
474
480
  self._outputs = ObjectList.from_list(Feature, outputs)
475
481
 
476
482
  def infer_from_df(self, df, label_columns=None, with_stats=True, num_bins=None):
@@ -642,8 +648,8 @@ def update_model(
642
648
  parameters: dict = None,
643
649
  metrics: dict = None,
644
650
  extra_data: dict = None,
645
- inputs: List[Feature] = None,
646
- outputs: List[Feature] = None,
651
+ inputs: list[Feature] = None,
652
+ outputs: list[Feature] = None,
647
653
  feature_vector: str = None,
648
654
  feature_weights: list = None,
649
655
  key_prefix: str = "",
@@ -114,6 +114,7 @@ from .model_monitoring import (
114
114
  EventFieldType,
115
115
  EventKeyMetrics,
116
116
  Features,
117
+ FeatureSetFeatures,
117
118
  FeatureValues,
118
119
  GrafanaColumn,
119
120
  GrafanaDataPoint,
@@ -22,6 +22,7 @@ from .constants import (
22
22
  EventFieldType,
23
23
  EventKeyMetrics,
24
24
  EventLiveStats,
25
+ FeatureSetFeatures,
25
26
  FileTargetKind,
26
27
  FunctionURI,
27
28
  ModelEndpointTarget,
@@ -77,6 +77,26 @@ class EventFieldType:
77
77
  SAMPLE_PARQUET_PATH = "sample_parquet_path"
78
78
 
79
79
 
80
+ class MonitoringStrEnum(StrEnum):
81
+ @classmethod
82
+ def list(cls):
83
+ return list(map(lambda c: c.value, cls))
84
+
85
+
86
+ class FeatureSetFeatures(MonitoringStrEnum):
87
+ LATENCY = EventFieldType.LATENCY
88
+ ERROR_COUNT = EventFieldType.ERROR_COUNT
89
+ METRICS = EventFieldType.METRICS
90
+
91
+ @classmethod
92
+ def time_stamp(cls):
93
+ return EventFieldType.TIMESTAMP
94
+
95
+ @classmethod
96
+ def entity(cls):
97
+ return EventFieldType.ENDPOINT_ID
98
+
99
+
80
100
  class ApplicationEvent:
81
101
  APPLICATION_NAME = "application_name"
82
102
  CURRENT_STATS = "current_stats"
@@ -89,7 +109,7 @@ class ApplicationEvent:
89
109
  OUTPUT_STREAM_URI = "output_stream_uri"
90
110
 
91
111
 
92
- class WriterEvent(StrEnum):
112
+ class WriterEvent(MonitoringStrEnum):
93
113
  APPLICATION_NAME = "application_name"
94
114
  ENDPOINT_ID = "endpoint_id"
95
115
  START_INFER_TIME = "start_infer_time"
@@ -101,10 +121,6 @@ class WriterEvent(StrEnum):
101
121
  RESULT_EXTRA_DATA = "result_extra_data"
102
122
  CURRENT_STATS = "current_stats"
103
123
 
104
- @classmethod
105
- def list(cls):
106
- return list(map(lambda c: c.value, cls))
107
-
108
124
 
109
125
  class EventLiveStats:
110
126
  LATENCY_AVG_5M = "latency_avg_5m"
mlrun/config.py CHANGED
@@ -611,8 +611,9 @@ default_config = {
611
611
  },
612
612
  "workflows": {
613
613
  "default_workflow_runner_name": "workflow-runner-{}",
614
- # Default timeout seconds for retrieving workflow id after execution:
615
- "timeouts": {"local": 120, "kfp": 30, "remote": 90},
614
+ # Default timeout seconds for retrieving workflow id after execution
615
+ # Remote workflow timeout is the maximum between remote and the inner engine timeout
616
+ "timeouts": {"local": 120, "kfp": 60, "remote": 60 * 5},
616
617
  },
617
618
  "log_collector": {
618
619
  "address": "localhost:8282",
@@ -671,6 +672,10 @@ default_config = {
671
672
  "access_key": "",
672
673
  },
673
674
  "grafana_url": "",
675
+ "auth_with_client_id": {
676
+ "enabled": False,
677
+ "request_timeout": 5,
678
+ },
674
679
  }
675
680
 
676
681
  _is_running_as_api = None
@@ -1061,7 +1066,7 @@ class Config:
1061
1066
  target: str = "online",
1062
1067
  artifact_path: str = None,
1063
1068
  application_name: str = None,
1064
- ) -> str:
1069
+ ) -> typing.Union[str, list[str]]:
1065
1070
  """Get the full path from the configuration based on the provided project and kind.
1066
1071
 
1067
1072
  :param project: Project name.
@@ -1077,7 +1082,8 @@ class Config:
1077
1082
  relative artifact path will be taken from the global MLRun artifact path.
1078
1083
  :param application_name: Application name, None for model_monitoring_stream.
1079
1084
 
1080
- :return: Full configured path for the provided kind.
1085
+ :return: Full configured path for the provided kind. Can be either a single path
1086
+ or a list of paths in the case of the online model monitoring stream path.
1081
1087
  """
1082
1088
 
1083
1089
  if target != "offline":
@@ -1098,12 +1104,22 @@ class Config:
1098
1104
  if application_name is None
1099
1105
  else f"{kind}-{application_name.lower()}",
1100
1106
  )
1101
- return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
1102
- project=project,
1103
- kind=kind
1104
- if application_name is None
1105
- else f"{kind}-{application_name.lower()}",
1106
- )
1107
+ elif kind == "stream": # return list for mlrun<1.6.3 BC
1108
+ return [
1109
+ mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
1110
+ project=project,
1111
+ kind=kind,
1112
+ ), # old stream uri (pipelines) for BC ML-6043
1113
+ mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
1114
+ project=project,
1115
+ kind=kind,
1116
+ ), # new stream uri (projects)
1117
+ ]
1118
+ else:
1119
+ return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
1120
+ project=project,
1121
+ kind=kind,
1122
+ )
1107
1123
 
1108
1124
  # Get the current offline path from the configuration
1109
1125
  file_path = mlrun.mlconf.model_endpoint_monitoring.offline_storage_path.format(
@@ -1360,10 +1376,14 @@ def read_env(env=None, prefix=env_prefix):
1360
1376
  if log_formatter_name := config.get("log_formatter"):
1361
1377
  import mlrun.utils.logger
1362
1378
 
1363
- log_formatter = mlrun.utils.create_formatter_instance(
1379
+ log_formatter = mlrun.utils.resolve_formatter_by_kind(
1364
1380
  mlrun.utils.FormatterKinds(log_formatter_name)
1365
1381
  )
1366
- mlrun.utils.logger.get_handler("default").setFormatter(log_formatter)
1382
+ current_handler = mlrun.utils.logger.get_handler("default")
1383
+ current_formatter_name = current_handler.formatter.__class__.__name__
1384
+ desired_formatter_name = log_formatter.__name__
1385
+ if current_formatter_name != desired_formatter_name:
1386
+ current_handler.setFormatter(log_formatter())
1367
1387
 
1368
1388
  # The default function pod resource values are of type str; however, when reading from environment variable numbers,
1369
1389
  # it converts them to type int if contains only number, so we want to convert them to str.
@@ -41,6 +41,7 @@ class ValueType(str, Enum):
41
41
  BYTES = "bytes"
42
42
  STRING = "str"
43
43
  DATETIME = "datetime"
44
+ LIST = "List"
44
45
  BYTES_LIST = "List[bytes]"
45
46
  STRING_LIST = "List[string]"
46
47
  INT32_LIST = "List[int32]"
@@ -48,6 +49,7 @@ class ValueType(str, Enum):
48
49
  DOUBLE_LIST = "List[float]"
49
50
  FLOAT_LIST = "List[float32]"
50
51
  BOOL_LIST = "List[bool]"
52
+ Tuple = "Tuple"
51
53
 
52
54
 
53
55
  def pd_schema_to_value_type(value):
@@ -102,6 +104,8 @@ def python_type_to_value_type(value_type):
102
104
  "datetime64[ns]": ValueType.INT64,
103
105
  "datetime64[ns, tz]": ValueType.INT64,
104
106
  "category": ValueType.STRING,
107
+ "list": ValueType.LIST,
108
+ "tuple": ValueType.Tuple,
105
109
  }
106
110
 
107
111
  if type_name in type_map:
mlrun/datastore/v3io.py CHANGED
@@ -12,8 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import mmap
16
- import os
17
15
  import time
18
16
  from datetime import datetime
19
17
 
@@ -22,7 +20,6 @@ import v3io
22
20
  from v3io.dataplane.response import HttpResponseError
23
21
 
24
22
  import mlrun
25
- from mlrun.datastore.helpers import ONE_GB, ONE_MB
26
23
 
27
24
  from ..platforms.iguazio import parse_path, split_path
28
25
  from .base import (
@@ -32,6 +29,7 @@ from .base import (
32
29
  )
33
30
 
34
31
  V3IO_LOCAL_ROOT = "v3io"
32
+ V3IO_DEFAULT_UPLOAD_CHUNK_SIZE = 1024 * 1024 * 100
35
33
 
36
34
 
37
35
  class V3ioStore(DataStore):
@@ -94,46 +92,28 @@ class V3ioStore(DataStore):
94
92
  )
95
93
  return self._sanitize_storage_options(res)
96
94
 
97
- def _upload(self, key: str, src_path: str, max_chunk_size: int = ONE_GB):
95
+ def _upload(
96
+ self,
97
+ key: str,
98
+ src_path: str,
99
+ max_chunk_size: int = V3IO_DEFAULT_UPLOAD_CHUNK_SIZE,
100
+ ):
98
101
  """helper function for upload method, allows for controlling max_chunk_size in testing"""
99
102
  container, path = split_path(self._join(key))
100
- file_size = os.path.getsize(src_path) # in bytes
101
- if file_size <= ONE_MB:
102
- with open(src_path, "rb") as source_file:
103
- data = source_file.read()
104
- self._do_object_request(
105
- self.object.put,
106
- container=container,
107
- path=path,
108
- body=data,
109
- append=False,
110
- )
111
- return
112
- # chunk must be a multiple of the ALLOCATIONGRANULARITY
113
- # https://docs.python.org/3/library/mmap.html
114
- if residue := max_chunk_size % mmap.ALLOCATIONGRANULARITY:
115
- # round down to the nearest multiple of ALLOCATIONGRANULARITY
116
- max_chunk_size -= residue
117
-
118
103
  with open(src_path, "rb") as file_obj:
119
- file_offset = 0
120
- while file_offset < file_size:
121
- chunk_size = min(file_size - file_offset, max_chunk_size)
122
- with mmap.mmap(
123
- file_obj.fileno(),
124
- length=chunk_size,
125
- access=mmap.ACCESS_READ,
126
- offset=file_offset,
127
- ) as mmap_obj:
128
- append = file_offset != 0
129
- self._do_object_request(
130
- self.object.put,
131
- container=container,
132
- path=path,
133
- body=mmap_obj,
134
- append=append,
135
- )
136
- file_offset += chunk_size
104
+ append = False
105
+ while True:
106
+ data = memoryview(file_obj.read(max_chunk_size))
107
+ if not data:
108
+ break
109
+ self._do_object_request(
110
+ self.object.put,
111
+ container=container,
112
+ path=path,
113
+ body=data,
114
+ append=append,
115
+ )
116
+ append = True
137
117
 
138
118
  def upload(self, key, src_path):
139
119
  return self._upload(key, src_path)
@@ -148,19 +128,16 @@ class V3ioStore(DataStore):
148
128
  num_bytes=size,
149
129
  ).body
150
130
 
151
- def _put(self, key, data, append=False, max_chunk_size: int = ONE_GB):
131
+ def _put(
132
+ self,
133
+ key,
134
+ data,
135
+ append=False,
136
+ max_chunk_size: int = V3IO_DEFAULT_UPLOAD_CHUNK_SIZE,
137
+ ):
152
138
  """helper function for put method, allows for controlling max_chunk_size in testing"""
153
139
  container, path = split_path(self._join(key))
154
140
  buffer_size = len(data) # in bytes
155
- if buffer_size <= ONE_MB:
156
- self._do_object_request(
157
- self.object.put,
158
- container=container,
159
- path=path,
160
- body=data,
161
- append=append,
162
- )
163
- return
164
141
  buffer_offset = 0
165
142
  try:
166
143
  data = memoryview(data)
mlrun/db/auth_utils.py ADDED
@@ -0,0 +1,152 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABC, abstractmethod
16
+ from datetime import datetime, timedelta
17
+
18
+ import requests
19
+
20
+ import mlrun.errors
21
+ from mlrun.utils import logger
22
+
23
+
24
+ class TokenProvider(ABC):
25
+ @abstractmethod
26
+ def get_token(self):
27
+ pass
28
+
29
+ @abstractmethod
30
+ def is_iguazio_session(self):
31
+ pass
32
+
33
+
34
+ class StaticTokenProvider(TokenProvider):
35
+ def __init__(self, token: str):
36
+ self.token = token
37
+
38
+ def get_token(self):
39
+ return self.token
40
+
41
+ def is_iguazio_session(self):
42
+ return mlrun.platforms.iguazio.is_iguazio_session(self.token)
43
+
44
+
45
+ class OAuthClientIDTokenProvider(TokenProvider):
46
+ def __init__(
47
+ self, token_endpoint: str, client_id: str, client_secret: str, timeout=5
48
+ ):
49
+ if not token_endpoint or not client_id or not client_secret:
50
+ raise mlrun.errors.MLRunValueError(
51
+ "Invalid client_id configuration for authentication. Must provide token endpoint, client-id and secret"
52
+ )
53
+ self.token_endpoint = token_endpoint
54
+ self.client_id = client_id
55
+ self.client_secret = client_secret
56
+ self.timeout = timeout
57
+
58
+ # Since we're only issuing POST requests, which are actually a disguised GET, then it's ok to allow retries
59
+ # on them.
60
+ self._session = mlrun.utils.HTTPSessionWithRetry(
61
+ retry_on_post=True,
62
+ verbose=True,
63
+ )
64
+
65
+ self._cleanup()
66
+ self._refresh_token_if_needed()
67
+
68
+ def get_token(self):
69
+ self._refresh_token_if_needed()
70
+ return self.token
71
+
72
+ def is_iguazio_session(self):
73
+ return False
74
+
75
+ def _cleanup(self):
76
+ self.token = self.token_expiry_time = self.token_refresh_time = None
77
+
78
+ def _refresh_token_if_needed(self):
79
+ now = datetime.now()
80
+ if self.token:
81
+ if self.token_refresh_time and now <= self.token_refresh_time:
82
+ return self.token
83
+
84
+ # We only cleanup if token was really expired - even if we fail in refreshing the token, we can still
85
+ # use the existing one given that it's not expired.
86
+ if now >= self.token_expiry_time:
87
+ self._cleanup()
88
+
89
+ self._issue_token_request()
90
+ return self.token
91
+
92
+ def _issue_token_request(self, raise_on_error=False):
93
+ try:
94
+ headers = {"Content-Type": "application/x-www-form-urlencoded"}
95
+ request_body = {
96
+ "grant_type": "client_credentials",
97
+ "client_id": self.client_id,
98
+ "client_secret": self.client_secret,
99
+ }
100
+ response = self._session.request(
101
+ "POST",
102
+ self.token_endpoint,
103
+ timeout=self.timeout,
104
+ headers=headers,
105
+ data=request_body,
106
+ )
107
+ except requests.RequestException as exc:
108
+ error = f"Retrieving token failed: {mlrun.errors.err_to_str(exc)}"
109
+ if raise_on_error:
110
+ raise mlrun.errors.MLRunRuntimeError(error) from exc
111
+ else:
112
+ logger.warning(error)
113
+ return
114
+
115
+ if not response.ok:
116
+ error = "No error available"
117
+ if response.content:
118
+ try:
119
+ data = response.json()
120
+ error = data.get("error")
121
+ except Exception:
122
+ pass
123
+ logger.warning(
124
+ "Retrieving token failed", status=response.status_code, error=error
125
+ )
126
+ if raise_on_error:
127
+ mlrun.errors.raise_for_status(response)
128
+ return
129
+
130
+ self._parse_response(response.json())
131
+
132
+ def _parse_response(self, data: dict):
133
+ # Response is described in https://datatracker.ietf.org/doc/html/rfc6749#section-4.4.3
134
+ # According to spec, there isn't a refresh token - just the access token and its expiry time (in seconds).
135
+ self.token = data.get("access_token")
136
+ expires_in = data.get("expires_in")
137
+ if not self.token or not expires_in:
138
+ token_str = "****" if self.token else "missing"
139
+ logger.warning(
140
+ "Failed to parse token response", token=token_str, expires_in=expires_in
141
+ )
142
+ return
143
+
144
+ now = datetime.now()
145
+ self.token_expiry_time = now + timedelta(seconds=expires_in)
146
+ self.token_refresh_time = now + timedelta(seconds=expires_in / 2)
147
+ logger.info(
148
+ "Successfully retrieved client-id token",
149
+ expires_in=expires_in,
150
+ expiry=str(self.token_expiry_time),
151
+ refresh=str(self.token_refresh_time),
152
+ )