mlrun 1.7.0rc3__py3-none-any.whl → 1.7.0rc4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- mlrun/artifacts/manager.py +6 -1
- mlrun/common/constants.py +1 -0
- mlrun/common/model_monitoring/helpers.py +12 -6
- mlrun/common/schemas/__init__.py +1 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/common.py +40 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -1
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +20 -15
- mlrun/datastore/azure_blob.py +22 -9
- mlrun/datastore/base.py +15 -25
- mlrun/datastore/datastore.py +19 -8
- mlrun/datastore/datastore_profile.py +47 -5
- mlrun/datastore/google_cloud_storage.py +10 -6
- mlrun/datastore/hdfs.py +51 -0
- mlrun/datastore/redis.py +4 -0
- mlrun/datastore/s3.py +4 -0
- mlrun/datastore/sources.py +29 -43
- mlrun/datastore/targets.py +58 -48
- mlrun/datastore/utils.py +2 -49
- mlrun/datastore/v3io.py +4 -0
- mlrun/db/base.py +34 -0
- mlrun/db/httpdb.py +71 -42
- mlrun/execution.py +3 -3
- mlrun/feature_store/feature_vector.py +2 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
- mlrun/frameworks/tf_keras/model_handler.py +7 -7
- mlrun/k8s_utils.py +10 -5
- mlrun/kfpops.py +19 -10
- mlrun/model.py +5 -0
- mlrun/model_monitoring/api.py +3 -3
- mlrun/model_monitoring/application.py +1 -1
- mlrun/model_monitoring/applications/__init__.py +13 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
- mlrun/model_monitoring/batch.py +9 -111
- mlrun/model_monitoring/controller.py +73 -55
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/features_drift_table.py +62 -53
- mlrun/model_monitoring/helpers.py +30 -21
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
- mlrun/package/packagers/pandas_packagers.py +3 -3
- mlrun/package/utils/_archiver.py +3 -1
- mlrun/platforms/iguazio.py +8 -65
- mlrun/projects/pipelines.py +21 -11
- mlrun/projects/project.py +121 -42
- mlrun/runtimes/base.py +21 -2
- mlrun/runtimes/kubejob.py +5 -3
- mlrun/runtimes/local.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +6 -6
- mlrun/runtimes/nuclio/function.py +9 -9
- mlrun/runtimes/nuclio/serving.py +3 -3
- mlrun/runtimes/pod.py +3 -3
- mlrun/runtimes/sparkjob/spark3job.py +3 -3
- mlrun/serving/remote.py +4 -2
- mlrun/serving/server.py +2 -8
- mlrun/utils/async_http.py +3 -3
- mlrun/utils/helpers.py +27 -5
- mlrun/utils/http.py +3 -3
- mlrun/utils/notifications/notification_pusher.py +6 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/METADATA +13 -16
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/RECORD +69 -63
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/top_level.txt +0 -0
mlrun/datastore/sources.py
CHANGED
@@ -39,7 +39,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )

@@ -193,14 +192,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.CSVSource(
-            paths=
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +204,17 @@ class CSVSource(BaseSourceDriver):
         )

     def get_spark_options(self):
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": url,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -357,14 +344,10 @@ class ParquetSource(BaseSourceDriver):
         attributes["context"] = context

         data_item = mlrun.store_manager.object(self.path)
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.ParquetSource(
-            paths=
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
@@ -374,20 +357,15 @@ class ParquetSource(BaseSourceDriver):
         )

     def get_spark_options(self):
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options

     def to_dataframe(
         self,
@@ -875,8 +853,16 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        path = "v3io:/" + path
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -896,7 +882,7 @@ class StreamSource(OnlineSource):
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
+            path,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
mlrun/datastore/targets.py
CHANGED
@@ -29,7 +29,7 @@ import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
-from mlrun.utils import now_date
+from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client

@@ -43,7 +43,6 @@ from .utils import (
     filter_df_start_end_time,
     parse_kafka_url,
     select_columns_from_df,
-    store_path_to_spark,
 )

@@ -448,14 +447,11 @@ class BaseStoreTarget(DataTargetBase):
             if self.credentials_prefix
             else None
         )
-        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-            return store, store.url + resolved_store_path
-        else:
-            return store, self.get_target_path()
+        return store, url

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -925,27 +921,21 @@ class ParquetTarget(BaseStoreTarget):
             if unit == time_partitioning_granularity:
                 break
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            result = {**result, **storage_spark_options}
-        else:
-            result = {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = mlrun.store_manager.get_or_create_store(
+            self.get_target_path()
+        )
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
         for partition_col in self.partition_cols or []:
             partition_cols.append(partition_col)
         if partition_cols:
-            return
+            spark_options["partitionBy"] = partition_cols
+        return spark_options

     def get_dask_options(self):
         return {"format": "parquet"}
@@ -1067,24 +1057,18 @@ class CSVTarget(BaseStoreTarget):
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = mlrun.store_manager.get_or_create_store(
+            self.get_target_path()
+        )
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
             }
+        )
+        return spark_options

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
@@ -1209,7 +1193,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
         df = df.copy(deep=False)
         access_key = self._get_credential("V3IO_ACCESS_KEY")

+        store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
@@ -1227,17 +1215,31 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver

-        endpoint, uri = parse_path(
+        store, target_path = self._get_store_and_path()
+        endpoint, uri = parse_path(target_path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+
         return Table(
             uri,
-            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
             flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        store_access_key = storage_options.get("v3io_access_key")
+        env_access_key = self._secrets.get(
+            "V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
+        )
+        if store_access_key and env_access_key and store_access_key != env_access_key:
+            logger.warning(
+                "The Spark v3io connector does not support access_key parameterization."
+                "Spark will disregard the store-provided key."
+            )
         spark_options = {
-            "path":
+            "path": store.spark_url + target_path,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1330,10 +1332,10 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         endpoint, uri = self._get_server_endpoint()
         parsed_endpoint = urlparse(endpoint)
+        store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" +
+            "table": "{" + store.spark_url + path,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1381,10 +1383,12 @@ class StreamTarget(BaseStoreTarget):
         from storey import V3ioDriver

         key_columns = list(key_columns.keys())
-        path = self.
+        store, path = self._get_store_and_path()
         if not path:
             raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
         endpoint, uri = parse_path(path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
@@ -1395,7 +1399,9 @@ class StreamTarget(BaseStoreTarget):
             graph_shape="cylinder",
             class_name="storey.StreamTarget",
             columns=column_list,
-            storage=V3ioDriver(
+            storage=V3ioDriver(
+                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
+            ),
             stream_path=uri,
             **self.attributes,
         )
@@ -1531,7 +1537,11 @@ class TSDBTarget(BaseStoreTarget):
             key_column = [key_column]
         new_index.extend(key_column)

+        store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
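
The target classes above now resolve their store via `self._get_store_and_path()` and take the v3io access key from the store's storage options when one is configured there, falling back to the environment credential. A small illustrative helper (not part of mlrun; the function name and parameters are hypothetical) that captures this precedence:

    import os

    def resolve_v3io_access_key(store, fallback_env_var="V3IO_ACCESS_KEY"):
        # Prefer the access key carried by the datastore profile;
        # otherwise fall back to the credential from the environment.
        storage_options = store.get_storage_options() or {}
        return storage_options.get("v3io_access_key", os.getenv(fallback_env_var))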
mlrun/datastore/utils.py
CHANGED
@@ -15,7 +15,7 @@
 import tarfile
 import tempfile
 import typing
-from urllib.parse import parse_qs, urlparse
+from urllib.parse import parse_qs, urlparse

 import pandas as pd
 import semver
@@ -23,53 +23,6 @@ import semver
 import mlrun.datastore


-def store_path_to_spark(path, spark_options=None):
-    schemas = ["redis://", "rediss://", "ds://"]
-    if any(path.startswith(schema) for schema in schemas):
-        url = urlparse(path)
-        if url.path:
-            path = url.path
-    elif path.startswith("gcs://"):
-        path = "gs:" + path[len("gcs:") :]
-    elif path.startswith("v3io:///"):
-        path = "v3io:" + path[len("v3io:/") :]
-    elif path.startswith("az://"):
-        account_key = None
-        path = "wasbs:" + path[len("az:") :]
-        prefix = "spark.hadoop.fs.azure.account.key."
-        if spark_options:
-            for key in spark_options:
-                if key.startswith(prefix):
-                    account_key = key[len(prefix) :]
-                    break
-        if account_key:
-            # transfer "wasb://basket/some/path" to wasb://basket@account_key.blob.core.windows.net/some/path
-            parsed_url = urlparse(path)
-            new_netloc = f"{parsed_url.hostname}@{account_key}"
-            path = urlunparse(
-                (
-                    parsed_url.scheme,
-                    new_netloc,
-                    parsed_url.path,
-                    parsed_url.params,
-                    parsed_url.query,
-                    parsed_url.fragment,
-                )
-            )
-    elif path.startswith("s3://"):
-        if path.startswith("s3:///"):
-            # 's3:///' not supported since mlrun 0.9.0 should use s3:// instead
-            from mlrun.errors import MLRunInvalidArgumentError
-
-            valid_path = "s3:" + path[len("s3:/") :]
-            raise MLRunInvalidArgumentError(
-                f"'s3:///' is not supported, try using 's3://' instead.\nE.g: '{valid_path}'"
-            )
-        else:
-            path = "s3a:" + path[len("s3:") :]
-    return path
-
-
 def parse_kafka_url(url: str, bootstrap_servers: list = None) -> tuple[str, list]:
     """Generating Kafka topic and adjusting a list of bootstrap servers.

@@ -105,7 +58,7 @@ def upload_tarball(source_dir, target, secrets=None):
     with tarfile.open(mode="w:gz", fileobj=temp_fh) as tar:
         tar.add(source_dir, arcname="")
     stores = mlrun.datastore.store_manager.set(secrets)
-    datastore, subpath = stores.get_or_create_store(target)
+    datastore, subpath, url = stores.get_or_create_store(target)
     datastore.upload(subpath, temp_fh.name)
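
With `store_path_to_spark` removed, its per-scheme path rewriting (gcs to gs, az to wasbs, s3 to s3a, and so on) appears to move behind each datastore's `spark_url` and `get_spark_options()`. The other change in this file is the widened return value of `get_or_create_store`; a sketch of the updated call shape in a caller such as `upload_tarball` (the target path is illustrative):

    import mlrun.datastore

    stores = mlrun.datastore.store_manager.set(secrets=None)
    # The third element (url) is new; existing two-value unpacking must be updated.
    datastore, subpath, url = stores.get_or_create_store("v3io:///projects/demo/archive.tar.gz")
    datastore.upload(subpath, "/tmp/archive.tar.gz")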
mlrun/datastore/v3io.py
CHANGED
@@ -79,6 +79,10 @@ class V3ioStore(DataStore):
         schema = "https" if self.secure else "http"
         return f"{schema}://{self.endpoint}"

+    @property
+    def spark_url(self):
+        return "v3io:/"
+
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
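
The new `spark_url` property returns "v3io:/" with a single trailing slash, presumably because the store-relative path already begins with "/"; concatenation then yields the double-slash form used in Spark paths. A trivial check of that assumption (the path is illustrative):

    # Illustrative only: the relative path starts with "/", so a single-slash
    # prefix produces the usual v3io:// Spark URL.
    spark_url = "v3io:/"
    relative_path = "/projects/demo/features"
    assert spark_url + relative_path == "v3io://projects/demo/features"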
mlrun/db/base.py
CHANGED
@@ -676,3 +676,37 @@ class RunDBInterface(ABC):
         self, func_url: str = None, function: "mlrun.runtimes.BaseRuntime" = None
     ):
         pass
+
+    def submit_workflow(
+        self,
+        project: str,
+        name: str,
+        workflow_spec: Union[
+            "mlrun.projects.pipelines.WorkflowSpec",
+            "mlrun.common.schemas.WorkflowSpec",
+            dict,
+        ],
+        arguments: Optional[dict] = None,
+        artifact_path: Optional[str] = None,
+        source: Optional[str] = None,
+        run_name: Optional[str] = None,
+        namespace: Optional[str] = None,
+        notifications: list["mlrun.model.Notification"] = None,
+    ) -> "mlrun.common.schemas.WorkflowResponse":
+        pass
+
+    def update_model_monitoring_controller(
+        self,
+        project: str,
+        base_period: int = 10,
+        image: str = "mlrun/mlrun",
+    ):
+        pass
+
+    def enable_model_monitoring(
+        self,
+        project: str,
+        base_period: int = 10,
+        image: str = "mlrun/mlrun",
+    ):
+        pass