mlrun 1.7.0rc3__py3-none-any.whl → 1.7.0rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mlrun might be problematic.
- mlrun/artifacts/manager.py +6 -1
- mlrun/common/constants.py +2 -0
- mlrun/common/model_monitoring/helpers.py +12 -6
- mlrun/common/schemas/__init__.py +11 -0
- mlrun/common/schemas/api_gateway.py +85 -0
- mlrun/common/schemas/auth.py +2 -2
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/common.py +40 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -1
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +31 -17
- mlrun/datastore/azure_blob.py +22 -9
- mlrun/datastore/base.py +15 -25
- mlrun/datastore/datastore.py +19 -8
- mlrun/datastore/datastore_profile.py +47 -5
- mlrun/datastore/google_cloud_storage.py +10 -6
- mlrun/datastore/hdfs.py +51 -0
- mlrun/datastore/redis.py +4 -0
- mlrun/datastore/s3.py +4 -0
- mlrun/datastore/sources.py +29 -43
- mlrun/datastore/targets.py +59 -53
- mlrun/datastore/utils.py +2 -49
- mlrun/datastore/v3io.py +4 -0
- mlrun/db/base.py +50 -0
- mlrun/db/httpdb.py +121 -50
- mlrun/db/nopdb.py +13 -0
- mlrun/execution.py +3 -3
- mlrun/feature_store/feature_vector.py +2 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
- mlrun/frameworks/tf_keras/model_handler.py +7 -7
- mlrun/k8s_utils.py +10 -5
- mlrun/kfpops.py +19 -10
- mlrun/model.py +5 -0
- mlrun/model_monitoring/api.py +3 -3
- mlrun/model_monitoring/application.py +1 -1
- mlrun/model_monitoring/applications/__init__.py +13 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
- mlrun/model_monitoring/batch.py +9 -111
- mlrun/model_monitoring/controller.py +73 -55
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/features_drift_table.py +62 -53
- mlrun/model_monitoring/helpers.py +30 -21
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
- mlrun/package/packagers/pandas_packagers.py +3 -3
- mlrun/package/utils/_archiver.py +3 -1
- mlrun/platforms/iguazio.py +8 -65
- mlrun/projects/pipelines.py +21 -11
- mlrun/projects/project.py +180 -42
- mlrun/run.py +1 -1
- mlrun/runtimes/base.py +25 -2
- mlrun/runtimes/kubejob.py +5 -3
- mlrun/runtimes/local.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +6 -6
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +300 -0
- mlrun/runtimes/nuclio/function.py +9 -9
- mlrun/runtimes/nuclio/serving.py +3 -3
- mlrun/runtimes/pod.py +3 -3
- mlrun/runtimes/sparkjob/spark3job.py +3 -3
- mlrun/serving/remote.py +4 -2
- mlrun/serving/server.py +2 -8
- mlrun/utils/async_http.py +3 -3
- mlrun/utils/helpers.py +27 -5
- mlrun/utils/http.py +3 -3
- mlrun/utils/logger.py +2 -2
- mlrun/utils/notifications/notification_pusher.py +6 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/METADATA +13 -16
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/RECORD +76 -68
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/top_level.txt +0 -0
mlrun/datastore/datastore_profile.py
CHANGED
@@ -132,6 +132,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes


+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -156,7 +172,7 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
            res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res
+        return res

     def url(self, subpath):
         return f"s3:/{subpath}"
@@ -199,7 +215,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res
+        return res

     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,7 +236,7 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res
+        return res


 class DatastoreProfileGCS(DatastoreProfile):
@@ -247,7 +263,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res
+        return res


 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -292,7 +308,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +386,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +395,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
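Note: the new DatastoreProfileV3io and DatastoreProfileHdfs classes plug into the existing ds:// datastore-profile mechanism. A minimal usage sketch, assuming the profile-registration helper available in recent mlrun releases; the profile names, endpoints, ports and access key below are placeholders, not values taken from this diff:

```python
import mlrun
from mlrun.datastore.datastore_profile import (
    DatastoreProfileHdfs,
    DatastoreProfileV3io,
    register_temporary_client_datastore_profile,
)

# Placeholder credentials and endpoints.
v3io_profile = DatastoreProfileV3io(name="my-v3io", v3io_access_key="<access-key>")
hdfs_profile = DatastoreProfileHdfs(
    name="my-hdfs", host="namenode.example.com", port=8020, http_port=9870, user="hdfs"
)

register_temporary_client_datastore_profile(v3io_profile)
register_temporary_client_datastore_profile(hdfs_profile)

# Profiles are then addressed through ds://<profile-name>/<path> URLs.
df = mlrun.get_dataitem("ds://my-hdfs/data/sample.csv").as_df()
```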
mlrun/datastore/google_cloud_storage.py
CHANGED
@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key.id"
-            ] = credentials["private_key_id"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key"
-            ] = credentials["private_key"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"
@@ -161,3 +161,7 @@
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
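Note: spark_url is a new property that this release adds across the datastores (S3Store, V3ioStore and the new HdfsStore follow below); Spark paths are composed as store.spark_url plus the path inside the store. A rough summary with placeholder endpoints:

```python
# Placeholder endpoints; each value mirrors the property added in this release.
spark_url_examples = {
    "GoogleCloudStorageStore": "gs://my-bucket",      # f"gs://{self.endpoint}"
    "S3Store": "s3a://my-bucket",                     # f"s3a://{self.endpoint}"
    "V3ioStore": "v3io:/",                            # constant prefix
    "HdfsStore": "hdfs://namenode.example.com:8020",  # f"hdfs://{self.host}:{self.port}"
}
print(spark_url_examples["S3Store"] + "/data.parquet")  # -> s3a://my-bucket/data.parquet
```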
mlrun/datastore/hdfs.py
ADDED
@@ -0,0 +1,51 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
mlrun/datastore/redis.py
CHANGED
mlrun/datastore/s3.py
CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):

         return self._sanitize_storage_options(storage_options)

+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
mlrun/datastore/sources.py
CHANGED
@@ -39,7 +39,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -193,14 +192,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
-
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.CSVSource(
-            paths=path,
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +204,17 @@
         )

     def get_spark_options(self):
-
-
-
-
-
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": url,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
            }
+        )
+        return spark_options

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -357,14 +344,10 @@ class ParquetSource(BaseSourceDriver):
             attributes["context"] = context

         data_item = mlrun.store_manager.object(self.path)
-
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.ParquetSource(
-            paths=path,
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
@@ -374,20 +357,15 @@
         )

     def get_spark_options(self):
-
-
-
-
-
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options

     def to_dataframe(
         self,
@@ -875,8 +853,16 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
-
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        path = "v3io:/" + path
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -896,7 +882,7 @@
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
-
+            path,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
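Note: the source changes above follow the new three-element return value of store_manager.get_or_create_store, whose third element is the fully resolved URL (ds:// and store:// already replaced). An illustrative sketch with a hypothetical registered profile and path:

```python
import mlrun

# Hypothetical ds:// path; assumes a "my-s3" profile has been registered.
store, subpath, url = mlrun.store_manager.get_or_create_store(
    "ds://my-s3/my-bucket/measurements.csv"
)
# store   -> the resolved datastore object (exposes get_storage_options / get_spark_options)
# subpath -> the path inside the store, e.g. "/measurements.csv"
# url     -> the concrete URL with the profile resolved, e.g. "s3://my-bucket/measurements.csv"
```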
mlrun/datastore/targets.py
CHANGED
@@ -29,7 +29,7 @@ import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
-from mlrun.utils import now_date
+from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client

@@ -43,7 +43,6 @@ from .utils import (
     filter_df_start_end_time,
     parse_kafka_url,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -448,14 +447,11 @@ class BaseStoreTarget(DataTargetBase):
             if self.credentials_prefix
             else None
         )
-        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-
-            return store, store.url + resolved_store_path
-        else:
-            return store, self.get_target_path()
+        return store, resolved_store_path, url

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -504,7 +500,7 @@
             write_spark_dataframe_with_options(options, df, "overwrite")
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -525,7 +521,7 @@
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
             if file_system.protocol == "file":
@@ -692,7 +688,7 @@
         raise NotImplementedError()

     def purge(self):
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         store.rm(target_path, recursive=True)

     def as_df(
@@ -872,7 +868,7 @@ class ParquetTarget(BaseStoreTarget):
             for key_column in key_columns:
                 tuple_key_columns.append((key_column.name, key_column.value_type))

-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()

         storage_options = store.get_storage_options()
         if storage_options and self.storage_options:
@@ -925,27 +921,19 @@
             if unit == time_partitioning_granularity:
                 break

-
-
-
-
-
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            result = {**result, **storage_spark_options}
-        else:
-            result = {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
         for partition_col in self.partition_cols or []:
             partition_cols.append(partition_col)
         if partition_cols:
-
-        return result
+            spark_options["partitionBy"] = partition_cols
+        return spark_options

     def get_dask_options(self):
         return {"format": "parquet"}
@@ -1052,7 +1040,7 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
@@ -1067,24 +1055,16 @@
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-
-
-
-
-
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
             }
+        )
+        return spark_options

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
@@ -1209,7 +1189,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
         df = df.copy(deep=False)
         access_key = self._get_credential("V3IO_ACCESS_KEY")

-
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
@@ -1227,17 +1211,31 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver

-
-        endpoint, uri = parse_path(
+        store, path_in_store, target_path = self._get_store_and_path()
+        endpoint, uri = parse_path(target_path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+
         return Table(
             uri,
-            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
             flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        store_access_key = storage_options.get("v3io_access_key")
+        env_access_key = self._secrets.get(
+            "V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
+        )
+        if store_access_key and env_access_key and store_access_key != env_access_key:
+            logger.warning(
+                "The Spark v3io connector does not support access_key parameterization."
+                "Spark will disregard the store-provided key."
+            )
         spark_options = {
-            "path":
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1330,10 +1328,10 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         endpoint, uri = self._get_server_endpoint()
         parsed_endpoint = urlparse(endpoint)
-
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" +
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1381,10 +1379,12 @@ class StreamTarget(BaseStoreTarget):
         from storey import V3ioDriver

         key_columns = list(key_columns.keys())
-        path = self.
+        store, path_in_store, path = self._get_store_and_path()
         if not path:
             raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
         endpoint, uri = parse_path(path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
@@ -1395,7 +1395,9 @@
             graph_shape="cylinder",
             class_name="storey.StreamTarget",
             columns=column_list,
-            storage=V3ioDriver(
+            storage=V3ioDriver(
+                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
+            ),
             stream_path=uri,
             **self.attributes,
         )
@@ -1531,7 +1533,11 @@ class TSDBTarget(BaseStoreTarget):
             key_column = [key_column]
         new_index.extend(key_column)

-
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
mlrun/datastore/utils.py
CHANGED
@@ -15,7 +15,7 @@
 import tarfile
 import tempfile
 import typing
-from urllib.parse import parse_qs, urlparse, urlunparse
+from urllib.parse import parse_qs, urlparse

 import pandas as pd
 import semver
@@ -23,53 +23,6 @@ import semver
 import mlrun.datastore


-def store_path_to_spark(path, spark_options=None):
-    schemas = ["redis://", "rediss://", "ds://"]
-    if any(path.startswith(schema) for schema in schemas):
-        url = urlparse(path)
-        if url.path:
-            path = url.path
-    elif path.startswith("gcs://"):
-        path = "gs:" + path[len("gcs:") :]
-    elif path.startswith("v3io:///"):
-        path = "v3io:" + path[len("v3io:/") :]
-    elif path.startswith("az://"):
-        account_key = None
-        path = "wasbs:" + path[len("az:") :]
-        prefix = "spark.hadoop.fs.azure.account.key."
-        if spark_options:
-            for key in spark_options:
-                if key.startswith(prefix):
-                    account_key = key[len(prefix) :]
-                    break
-        if account_key:
-            # transfer "wasb://basket/some/path" to wasb://basket@account_key.blob.core.windows.net/some/path
-            parsed_url = urlparse(path)
-            new_netloc = f"{parsed_url.hostname}@{account_key}"
-            path = urlunparse(
-                (
-                    parsed_url.scheme,
-                    new_netloc,
-                    parsed_url.path,
-                    parsed_url.params,
-                    parsed_url.query,
-                    parsed_url.fragment,
-                )
-            )
-    elif path.startswith("s3://"):
-        if path.startswith("s3:///"):
-            # 's3:///' not supported since mlrun 0.9.0 should use s3:// instead
-            from mlrun.errors import MLRunInvalidArgumentError
-
-            valid_path = "s3:" + path[len("s3:/") :]
-            raise MLRunInvalidArgumentError(
-                f"'s3:///' is not supported, try using 's3://' instead.\nE.g: '{valid_path}'"
-            )
-        else:
-            path = "s3a:" + path[len("s3:") :]
-    return path
-
-
 def parse_kafka_url(url: str, bootstrap_servers: list = None) -> tuple[str, list]:
     """Generating Kafka topic and adjusting a list of bootstrap servers.

@@ -105,7 +58,7 @@ def upload_tarball(source_dir, target, secrets=None):
     with tarfile.open(mode="w:gz", fileobj=temp_fh) as tar:
         tar.add(source_dir, arcname="")
     stores = mlrun.datastore.store_manager.set(secrets)
-    datastore, subpath = stores.get_or_create_store(target)
+    datastore, subpath, url = stores.get_or_create_store(target)
     datastore.upload(subpath, temp_fh.name)

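Note: store_path_to_spark() and its per-scheme rewriting (gcs to gs, az to wasbs, s3 to s3a, v3io:/// to v3io:) are gone; callers now resolve the store and build the Spark path from its spark_url. A sketch of the replacement pattern with a hypothetical bucket:

```python
import mlrun

# Hypothetical URL; the exact endpoint/subpath split depends on the store type.
store, path_in_store, url = mlrun.store_manager.get_or_create_store("s3://my-bucket/data.parquet")
spark_path = store.spark_url + path_in_store  # e.g. "s3a://my-bucket" + "/data.parquet"
```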
mlrun/datastore/v3io.py
CHANGED
@@ -79,6 +79,10 @@ class V3ioStore(DataStore):
         schema = "https" if self.secure else "http"
         return f"{schema}://{self.endpoint}"

+    @property
+    def spark_url(self):
+        return "v3io:/"
+
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""