mlrun 1.7.0rc1__py3-none-any.whl → 1.7.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mlrun has been flagged as potentially problematic.

mlrun/artifacts/model.py CHANGED
@@ -13,7 +13,9 @@
 # limitations under the License.
 import tempfile
 from os import path
+from typing import Any

+import pandas as pd
 import yaml
 from deprecated import deprecated

@@ -259,6 +261,7 @@ class ModelArtifact(Artifact):
         """
         subset = df
         inferer = get_infer_interface(subset)
+        numeric_columns = self._extract_numeric_features(df)
         if label_columns:
             if not isinstance(label_columns, list):
                 label_columns = [label_columns]
@@ -272,9 +275,13 @@ class ModelArtifact(Artifact):
         )
         if with_stats:
             self.spec.feature_stats = inferer.get_stats(
-                df, options=InferOptions.Histogram, num_bins=num_bins
+                df[numeric_columns], options=InferOptions.Histogram, num_bins=num_bins
             )

+    @staticmethod
+    def _extract_numeric_features(df: pd.DataFrame) -> list[Any]:
+        return [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
+
     @property
     def is_dir(self):
         return True
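The new `_extract_numeric_features` helper means histogram statistics are now computed only over numeric columns of the input DataFrame. A minimal standalone sketch of the same filtering, using an illustrative DataFrame that is not part of mlrun:

import pandas as pd

df = pd.DataFrame(
    {
        "age": [25, 32, 47],       # numeric -> included in stats
        "score": [0.1, 0.5, 0.9],  # numeric -> included in stats
        "name": ["a", "b", "c"],   # non-numeric -> excluded
    }
)

# same dtype check the new helper uses
numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
print(numeric_columns)                 # ['age', 'score']
print(df[numeric_columns].describe())  # stats restricted to numeric columns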
@@ -62,9 +62,12 @@ def _init_engine(dsn=None):
     max_overflow = config.httpdb.db.connections_pool_max_overflow
     if max_overflow is None:
         max_overflow = config.httpdb.max_workers
+
     kwargs = {
         "pool_size": pool_size,
         "max_overflow": max_overflow,
+        "pool_pre_ping": config.httpdb.db.connections_pool_pre_ping,
+        "pool_recycle": config.httpdb.db.connections_pool_recycle,
     }
     engine = create_engine(dsn, **kwargs)
     _engines[dsn] = engine
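`pool_pre_ping` and `pool_recycle` are standard SQLAlchemy engine options, so the resulting engine configuration amounts to something like the sketch below (the DSN and pool sizes are placeholders, not values taken from this diff):

from sqlalchemy import create_engine

engine = create_engine(
    "mysql+pymysql://user:password@db-host:3306/mlrun",  # placeholder DSN
    pool_size=5,           # placeholder; mlrun reads these from httpdb config
    max_overflow=10,       # placeholder
    pool_pre_ping=True,    # lightweight liveness check on every connection checkout
    pool_recycle=60 * 60,  # drop pooled connections older than one hour
)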
mlrun/config.py CHANGED
@@ -109,7 +109,10 @@ default_config = {
        "runs": {
            # deleting runs is a heavy operation that includes deleting runtime resources, therefore we do it in chunks
            "batch_delete_runs_chunk_size": 10,
-        }
+        },
+        "resources": {
+            "delete_crd_resources_timeout": "5 minutes",
+        },
    },
    # the grace period (in seconds) that will be given to runtime resources (after they're in terminal state)
    # before deleting them (4 hours)
@@ -303,7 +306,11 @@ default_config = {
            # default is 16MB, max 1G, for more info https://dev.mysql.com/doc/refman/8.0/en/packet-too-large.html
            "max_allowed_packet": 64000000,  # 64MB
        },
-        # None will set this to be equal to the httpdb.max_workers
+        # tests connections for liveness upon each checkout
+        "connections_pool_pre_ping": True,
+        # this setting causes the pool to recycle connections after the given number of seconds has passed
+        "connections_pool_recycle": 60 * 60,
+        # None defaults to httpdb.max_workers
        "connections_pool_size": None,
        "connections_pool_max_overflow": None,
        # below is a db-specific configuration
@@ -408,7 +415,7 @@ default_config = {
        "iguazio_access_key": "",
        "iguazio_list_projects_default_page_size": 200,
        "iguazio_client_job_cache_ttl": "20 minutes",
-        "nuclio_project_deletion_verification_timeout": "60 seconds",
+        "nuclio_project_deletion_verification_timeout": "300 seconds",
        "nuclio_project_deletion_verification_interval": "5 seconds",
    },
    # The API needs to know what is its k8s svc url so it could enrich it in the jobs it creates
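Like the rest of `default_config`, these values can be overridden at runtime. A short sketch assuming mlrun's usual `mlrun.mlconf` object and `MLRUN_`-prefixed, `__`-nested environment variables (the exact override mechanism is an assumption, not shown in this diff):

import mlrun

# in code, before the DB engine / API client is created
mlrun.mlconf.httpdb.db.connections_pool_recycle = 30 * 60
mlrun.mlconf.httpdb.db.connections_pool_pre_ping = True

# or via environment variables, for example:
#   MLRUN_HTTPDB__DB__CONNECTIONS_POOL_RECYCLE=1800
#   MLRUN_HTTPDB__DB__CONNECTIONS_POOL_PRE_PING=true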
mlrun/datastore/base.py CHANGED
@@ -654,34 +654,6 @@ def http_get(url, headers=None, auth=None):
     return response.content


-def http_head(url, headers=None, auth=None):
-    try:
-        response = requests.head(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.headers
-
-
-def http_put(url, data, headers=None, auth=None, session=None):
-    try:
-        put_api = session.put if session else requests.put
-        response = put_api(
-            url, data=data, headers=headers, auth=auth, verify=verify_ssl
-        )
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}") from exc
-
-    mlrun.errors.raise_for_status(response)
-
-
-def http_upload(url, file_path, headers=None, auth=None):
-    with open(file_path, "rb") as data:
-        http_put(url, data, headers, auth)
-
-
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
@@ -382,6 +382,18 @@ def datastore_profile_read(url, project_name="", secrets: dict = None):
     public_profile = mlrun.db.get_run_db().get_datastore_profile(
         profile_name, project_name
     )
+    # The mlrun.db.get_run_db().get_datastore_profile() function is capable of returning
+    # two distinct types of objects based on its execution context.
+    # If it operates from the client or within the pod (which is the common scenario),
+    # it yields an instance of `mlrun.datastore.DatastoreProfile`. Conversely,
+    # when executed on the server with a direct call to `sqldb`, it produces an instance of
+    # mlrun.common.schemas.DatastoreProfile.
+    # In the latter scenario, an extra conversion step is required to transform the object
+    # into mlrun.datastore.DatastoreProfile.
+    if isinstance(public_profile, mlrun.common.schemas.DatastoreProfile):
+        public_profile = DatastoreProfile2Json.create_from_json(
+            public_json=public_profile.object
+        )
     project_ds_name_private = DatastoreProfile.generate_secret_key(
         profile_name, project_name
     )
@@ -848,8 +848,6 @@ class HttpSource(OnlineSource):


 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"

     def __init__(
@@ -863,7 +861,7 @@ class StreamSource(OnlineSource):
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.

         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"
@@ -915,8 +913,6 @@


 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"

     def __init__(
@@ -727,7 +727,7 @@ class BaseStoreTarget(DataTargetBase):


 class ParquetTarget(BaseStoreTarget):
-    """parquet target storage driver, used to materialize feature set/vector data into parquet files
+    """Parquet target storage driver, used to materialize feature set/vector data into parquet files.

     :param name: optional, target name. By default will be called ParquetTarget
     :param path: optional, Output path. Can be either a file or directory.
@@ -1911,12 +1911,16 @@ class SQLTarget(BaseStoreTarget):
             # creat new table with the given name
             columns = []
             for col, col_type in self.schema.items():
-                col_type = TYPE_TO_SQL_TYPE.get(col_type)
-                if col_type is None:
-                    raise TypeError(f"{col_type} unsupported type")
+                col_type_sql = TYPE_TO_SQL_TYPE.get(col_type)
+                if col_type_sql is None:
+                    raise TypeError(
+                        f"'{col_type}' unsupported type for column '{col}'"
+                    )
                 columns.append(
                     sqlalchemy.Column(
-                        col, col_type, primary_key=(col in primary_key_for_check)
+                        col,
+                        col_type_sql,
+                        primary_key=(col in primary_key_for_check),
                     )
                 )

mlrun/datastore/v3io.py CHANGED
@@ -15,12 +15,11 @@
 import mmap
 import os
 import time
-from copy import deepcopy
 from datetime import datetime

 import fsspec
-import requests
-import v3io.dataplane
+import v3io
+from v3io.dataplane.response import HttpResponseError

 import mlrun
 from mlrun.datastore.helpers import ONE_GB, ONE_MB
@@ -30,11 +29,6 @@ from .base import (
     DataStore,
     FileStats,
     basic_auth_header,
-    get_range,
-    http_get,
-    http_head,
-    http_put,
-    http_upload,
 )

 V3IO_LOCAL_ROOT = "v3io"
@@ -47,17 +41,18 @@ class V3ioStore(DataStore):

         self.headers = None
         self.secure = self.kind == "v3ios"
+
+        token = self._get_secret_or_env("V3IO_ACCESS_KEY")
+        username = self._get_secret_or_env("V3IO_USERNAME")
+        password = self._get_secret_or_env("V3IO_PASSWORD")
         if self.endpoint.startswith("https://"):
             self.endpoint = self.endpoint[len("https://") :]
             self.secure = True
         elif self.endpoint.startswith("http://"):
             self.endpoint = self.endpoint[len("http://") :]
             self.secure = False
-
-        token = self._get_secret_or_env("V3IO_ACCESS_KEY")
-        username = self._get_secret_or_env("V3IO_USERNAME")
-        password = self._get_secret_or_env("V3IO_PASSWORD")
-
+        self.client = v3io.dataplane.Client(access_key=token, endpoint=self.url)
+        self.object = self.client.object
         self.auth = None
         self.token = token
         if token:
@@ -65,6 +60,16 @@ class V3ioStore(DataStore):
         elif username and password:
             self.headers = basic_auth_header(username, password)

+    @staticmethod
+    def _do_object_request(function: callable, *args, **kwargs):
+        try:
+            return function(*args, **kwargs)
+        except HttpResponseError as http_response_error:
+            raise mlrun.errors.err_for_status_code(
+                status_code=http_response_error.status_code,
+                message=mlrun.errors.err_to_str(http_response_error),
+            )
+
     @staticmethod
     def uri_to_ipython(endpoint, subpath):
         return V3IO_LOCAL_ROOT + subpath
@@ -91,13 +96,19 @@ class V3ioStore(DataStore):

     def _upload(self, key: str, src_path: str, max_chunk_size: int = ONE_GB):
         """helper function for upload method, allows for controlling max_chunk_size in testing"""
+        container, path = split_path(self._join(key))
         file_size = os.path.getsize(src_path)  # in bytes
         if file_size <= ONE_MB:
-            http_upload(self.url + self._join(key), src_path, self.headers, None)
+            with open(src_path, "rb") as source_file:
+                data = source_file.read()
+            self._do_object_request(
+                self.object.put,
+                container=container,
+                path=path,
+                body=data,
+                append=False,
+            )
             return
-        append_header = deepcopy(self.headers)
-        append_header["Range"] = "-1"
-
         # chunk must be a multiple of the ALLOCATIONGRANULARITY
         # https://docs.python.org/3/library/mmap.html
         if residue := max_chunk_size % mmap.ALLOCATIONGRANULARITY:
@@ -114,11 +125,13 @@ class V3ioStore(DataStore):
                 access=mmap.ACCESS_READ,
                 offset=file_offset,
             ) as mmap_obj:
-                http_put(
-                    self.url + self._join(key),
-                    mmap_obj,
-                    append_header if file_offset else self.headers,
-                    None,
+                append = file_offset != 0
+                self._do_object_request(
+                    self.object.put,
+                    container=container,
+                    path=path,
+                    body=mmap_obj,
+                    append=append,
                 )
                 file_offset += chunk_size

@@ -126,43 +139,55 @@ class V3ioStore(DataStore):
         return self._upload(key, src_path)

     def get(self, key, size=None, offset=0):
-        headers = self.headers
-        if size or offset:
-            headers = deepcopy(headers)
-            headers["Range"] = get_range(size, offset)
-        return http_get(self.url + self._join(key), headers)
+        container, path = split_path(self._join(key))
+        return self._do_object_request(
+            function=self.object.get,
+            container=container,
+            path=path,
+            offset=offset,
+            num_bytes=size,
+        ).body

-    def _put(self, key, data, max_chunk_size: int = ONE_GB):
+    def _put(self, key, data, append=False, max_chunk_size: int = ONE_GB):
         """helper function for put method, allows for controlling max_chunk_size in testing"""
+        container, path = split_path(self._join(key))
         buffer_size = len(data)  # in bytes
         if buffer_size <= ONE_MB:
-            http_put(self.url + self._join(key), data, self.headers, None)
+            self._do_object_request(
+                self.object.put,
+                container=container,
+                path=path,
+                body=data,
+                append=append,
+            )
             return
-        append_header = deepcopy(self.headers)
-        append_header["Range"] = "-1"
         buffer_offset = 0
         try:
             data = memoryview(data)
         except TypeError:
             pass

-        with requests.Session() as requests_session:
-            while buffer_offset < buffer_size:
-                chunk_size = min(buffer_size - buffer_offset, max_chunk_size)
-                http_put(
-                    self.url + self._join(key),
-                    data[buffer_offset : buffer_offset + chunk_size],
-                    append_header if buffer_offset else self.headers,
-                    None,
-                    requests_session,
-                )
-                buffer_offset += chunk_size
+        while buffer_offset < buffer_size:
+            chunk_size = min(buffer_size - buffer_offset, max_chunk_size)
+            append = True if buffer_offset or append else False
+            self._do_object_request(
+                self.object.put,
+                container=container,
+                path=path,
+                body=data[buffer_offset : buffer_offset + chunk_size],
+                append=append,
+            )
+            buffer_offset += chunk_size

     def put(self, key, data, append=False):
-        return self._put(key, data)
+        return self._put(key, data, append)

     def stat(self, key):
-        head = http_head(self.url + self._join(key), self.headers)
+        container, path = split_path(self._join(key))
+        response = self._do_object_request(
+            function=self.object.head, container=container, path=path
+        )
+        head = dict(response.headers)
         size = int(head.get("Content-Length", "0"))
         datestr = head.get("Last-Modified", "0")
         modified = time.mktime(
@@ -171,7 +196,6 @@ class V3ioStore(DataStore):
         return FileStats(size, modified)

     def listdir(self, key):
-        v3io_client = v3io.dataplane.Client(endpoint=self.url, access_key=self.token)
         container, subpath = split_path(self._join(key))
         if not subpath.endswith("/"):
             subpath += "/"
@@ -180,7 +204,7 @@ class V3ioStore(DataStore):
         subpath_length = len(subpath) - 1

         try:
-            response = v3io_client.container.list(
+            response = self.client.container.list(
                 container=container,
                 path=subpath,
                 get_all_attributes=False,
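With this change, V3ioStore routes object reads and writes through the `v3io` dataplane client instead of raw HTTP helpers. A minimal sketch of the same client calls used directly, assuming a reachable v3io endpoint; the endpoint, access key, container, and path below are placeholders:

import v3io.dataplane

client = v3io.dataplane.Client(
    endpoint="https://webapi.default-tenant.app.example.com",  # placeholder endpoint
    access_key="my-access-key",                                # placeholder key
)

# write an object (append=False overwrites any existing content)
client.object.put(
    container="users", path="/tmp/example.txt", body=b"hello", append=False
)

# read it back; the payload is available on the response body
response = client.object.get(container="users", path="/tmp/example.txt")
print(response.body)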
@@ -114,44 +114,6 @@ def get_offline_features(
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
 ):
-    return _get_offline_features(
-        feature_vector,
-        entity_rows,
-        entity_timestamp_column,
-        target,
-        run_config,
-        drop_columns,
-        start_time,
-        end_time,
-        with_indexes,
-        update_stats,
-        engine,
-        engine_args,
-        query,
-        order_by,
-        spark_service,
-        timestamp_for_filtering,
-    )
-
-
-def _get_offline_features(
-    feature_vector: Union[str, FeatureVector],
-    entity_rows=None,
-    entity_timestamp_column: str = None,
-    target: DataTargetBase = None,
-    run_config: RunConfig = None,
-    drop_columns: list[str] = None,
-    start_time: Union[str, datetime] = None,
-    end_time: Union[str, datetime] = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-    engine: str = None,
-    engine_args: dict = None,
-    query: str = None,
-    order_by: Union[str, list[str]] = None,
-    spark_service: str = None,
-    timestamp_for_filtering: Union[str, dict[str, str]] = None,
-) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     """retrieve offline feature vector results

     specify a feature vector object/uri and retrieve the desired features, their metadata
@@ -212,6 +174,44 @@ def _get_offline_features(
         merge process using start_time and end_time params.

     """
+    return _get_offline_features(
+        feature_vector,
+        entity_rows,
+        entity_timestamp_column,
+        target,
+        run_config,
+        drop_columns,
+        start_time,
+        end_time,
+        with_indexes,
+        update_stats,
+        engine,
+        engine_args,
+        query,
+        order_by,
+        spark_service,
+        timestamp_for_filtering,
+    )
+
+
+def _get_offline_features(
+    feature_vector: Union[str, FeatureVector],
+    entity_rows=None,
+    entity_timestamp_column: str = None,
+    target: DataTargetBase = None,
+    run_config: RunConfig = None,
+    drop_columns: list[str] = None,
+    start_time: Union[str, datetime] = None,
+    end_time: Union[str, datetime] = None,
+    with_indexes: bool = False,
+    update_stats: bool = False,
+    engine: str = None,
+    engine_args: dict = None,
+    query: str = None,
+    order_by: Union[str, list[str]] = None,
+    spark_service: str = None,
+    timestamp_for_filtering: Union[str, dict[str, str]] = None,
+) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(
             "entity_timestamp_column param "
@@ -281,24 +281,6 @@ def get_online_feature_service(
     update_stats: bool = False,
     entity_keys: list[str] = None,
 ):
-    return _get_online_feature_service(
-        feature_vector,
-        run_config,
-        fixed_window_type,
-        impute_policy,
-        update_stats,
-        entity_keys,
-    )
-
-
-def _get_online_feature_service(
-    feature_vector: Union[str, FeatureVector],
-    run_config: RunConfig = None,
-    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
-    impute_policy: dict = None,
-    update_stats: bool = False,
-    entity_keys: list[str] = None,
-) -> OnlineVectorService:
     """initialize and return online feature vector service api,
     returns :py:class:`~mlrun.feature_store.OnlineVectorService`

@@ -362,6 +344,24 @@ def _get_online_feature_service(
     :return: Initialize the `OnlineVectorService`.
              Will be used in subclasses where `support_online=True`.
     """
+    return _get_online_feature_service(
+        feature_vector,
+        run_config,
+        fixed_window_type,
+        impute_policy,
+        update_stats,
+        entity_keys,
+    )
+
+
+def _get_online_feature_service(
+    feature_vector: Union[str, FeatureVector],
+    run_config: RunConfig = None,
+    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
+    impute_policy: dict = None,
+    update_stats: bool = False,
+    entity_keys: list[str] = None,
+) -> OnlineVectorService:
     if isinstance(feature_vector, FeatureVector):
         update_stats = True
         feature_vector = _features_to_vector_and_check_permissions(
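`get_online_feature_service` is restructured the same way, with the public function delegating to a private helper. A minimal usage sketch under the same assumptions (the vector URI and entity key are placeholders):

import mlrun.feature_store as fstore

svc = fstore.get_online_feature_service("store://feature-vectors/my-project/my-vector")
try:
    # look up features for a single entity
    resp = svc.get([{"customer_id": "42"}])
    print(resp)
finally:
    svc.close()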
@@ -318,8 +318,6 @@ def emit_policy_to_dict(policy: EmitPolicy):


 class FeatureSet(ModelObj):
-    """Feature set object, defines a set of features and their data pipeline"""
-
     kind = mlrun.common.schemas.ObjectKind.feature_set.value
     _dict_fields = ["kind", "metadata", "spec", "status"]