mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (135)
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/datastore/azure_blob.py CHANGED
@@ -16,12 +16,13 @@ import time
  from pathlib import Path
  from urllib.parse import urlparse

+ from azure.storage.blob import BlobServiceClient
  from azure.storage.blob._shared.base_client import parse_connection_str
  from fsspec.registry import get_filesystem_class

  import mlrun.errors

- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer

  # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
  # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer

  class AzureBlobStore(DataStore):
      using_bucket = True
+     max_concurrency = 100
+     max_blocksize = 1024 * 1024 * 4
+     max_single_put_size = (
+         1024 * 1024 * 8
+     )  # for service_client property only, does not affect filesystem

      def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
          super().__init__(parent, name, schema, endpoint, secrets=secrets)
+         self._service_client = None
+         self._storage_options = None
+
+     def get_storage_options(self):
+         return self.storage_options
+
+     @property
+     def storage_options(self):
+         if not self._storage_options:
+             res = dict(
+                 account_name=self._get_secret_or_env("account_name")
+                 or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                 account_key=self._get_secret_or_env("account_key")
+                 or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                 connection_string=self._get_secret_or_env("connection_string")
+                 or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                 tenant_id=self._get_secret_or_env("tenant_id")
+                 or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                 client_id=self._get_secret_or_env("client_id")
+                 or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                 client_secret=self._get_secret_or_env("client_secret")
+                 or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                 sas_token=self._get_secret_or_env("sas_token")
+                 or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                 credential=self._get_secret_or_env("credential"),
+             )
+             self._storage_options = self._sanitize_storage_options(res)
+         return self._storage_options

      @property
      def filesystem(self):
          """return fsspec file system object, if supported"""
-         if self._filesystem:
-             return self._filesystem
          try:
              import adlfs  # noqa
          except ImportError as exc:
              raise ImportError("Azure adlfs not installed") from exc
-         # in order to support az and wasbs kinds.
-         filesystem_class = get_filesystem_class(protocol=self.kind)
-         self._filesystem = makeDatastoreSchemaSanitizer(
-             filesystem_class,
-             using_bucket=self.using_bucket,
-             **self.get_storage_options(),
-         )
+
+         if not self._filesystem:
+             # in order to support az and wasbs kinds
+             filesystem_class = get_filesystem_class(protocol=self.kind)
+             self._filesystem = make_datastore_schema_sanitizer(
+                 filesystem_class,
+                 using_bucket=self.using_bucket,
+                 blocksize=self.max_blocksize,
+                 **self.storage_options,
+             )
          return self._filesystem

-     def get_storage_options(self):
-         res = dict(
-             account_name=self._get_secret_or_env("account_name")
-             or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
-             account_key=self._get_secret_or_env("account_key")
-             or self._get_secret_or_env("AZURE_STORAGE_KEY"),
-             connection_string=self._get_secret_or_env("connection_string")
-             or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
-             tenant_id=self._get_secret_or_env("tenant_id")
-             or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
-             client_id=self._get_secret_or_env("client_id")
-             or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
-             client_secret=self._get_secret_or_env("client_secret")
-             or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
-             sas_token=self._get_secret_or_env("sas_token")
-             or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
-             credential=self._get_secret_or_env("credential"),
-         )
-         return self._sanitize_storage_options(res)
+     @property
+     def service_client(self):
+         try:
+             import azure  # noqa
+         except ImportError as exc:
+             raise ImportError("Azure not installed") from exc
+
+         if not self._service_client:
+             self._do_connect()
+         return self._service_client
+
+     def _do_connect(self):
+         """
+         Creates a client for azure.
+         Raises MLRunInvalidArgumentError if none of the connection details are available,
+         based on do_connect in AzureBlobFileSystem:
+         https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+         """
+         from azure.identity import ClientSecretCredential
+
+         storage_options = self.storage_options
+         connection_string = storage_options.get("connection_string")
+         client_name = storage_options.get("account_name")
+         account_key = storage_options.get("account_key")
+         sas_token = storage_options.get("sas_token")
+         client_id = storage_options.get("client_id")
+         credential = storage_options.get("credential")
+
+         credential_from_client_id = None
+         if (
+             credential is None
+             and account_key is None
+             and sas_token is None
+             and client_id is not None
+         ):
+             credential_from_client_id = ClientSecretCredential(
+                 tenant_id=storage_options.get("tenant_id"),
+                 client_id=client_id,
+                 client_secret=storage_options.get("client_secret"),
+             )
+         try:
+             if connection_string is not None:
+                 self._service_client = BlobServiceClient.from_connection_string(
+                     conn_str=connection_string,
+                     max_block_size=self.max_blocksize,
+                     max_single_put_size=self.max_single_put_size,
+                 )
+             elif client_name is not None:
+                 account_url = f"https://{client_name}.blob.core.windows.net"
+                 cred = credential_from_client_id or credential or account_key
+                 if not cred and sas_token is not None:
+                     if not sas_token.startswith("?"):
+                         sas_token = f"?{sas_token}"
+                     account_url = account_url + sas_token
+                 self._service_client = BlobServiceClient(
+                     account_url=account_url,
+                     credential=cred,
+                     max_block_size=self.max_blocksize,
+                     max_single_put_size=self.max_single_put_size,
+                 )
+             else:
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     "Must provide either a connection_string or account_name with credentials"
+                 )
+         except Exception as e:
+             raise mlrun.errors.MLRunInvalidArgumentError(
+                 f"unable to connect to account for {e}"
+             )

      def _convert_key_to_remote_path(self, key):
          key = key.strip("/")
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):

      def upload(self, key, src_path):
          remote_path = self._convert_key_to_remote_path(key)
-         self.filesystem.put_file(src_path, remote_path, overwrite=True)
+         container, remote_path = remote_path.split("/", 1)
+         container_client = self.service_client.get_container_client(container=container)
+         with open(file=src_path, mode="rb") as data:
+             container_client.upload_blob(
+                 name=remote_path,
+                 data=data,
+                 overwrite=True,
+                 max_concurrency=self.max_concurrency,
+             )

      def get(self, key, size=None, offset=0):
          remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@ class AzureBlobStore(DataStore):
                  "Append mode not supported for Azure blob datastore"
              )
          remote_path = self._convert_key_to_remote_path(key)
-         if isinstance(data, bytes):
-             mode = "wb"
-         elif isinstance(data, str):
-             mode = "w"
-         else:
-             raise TypeError("Data type unknown. Unable to put in Azure!")
+         data, mode = self._prepare_put_data(data, append)
          with self.filesystem.open(remote_path, mode) as f:
              f.write(data)

@@ -135,7 +223,7 @@ class AzureBlobStore(DataStore):

      def get_spark_options(self):
          res = {}
-         st = self.get_storage_options()
+         st = self.storage_options
          service = "blob"
          primary_url = None
          if st.get("connection_string"):
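
Note (not part of the diff): a minimal usage sketch of the reworked Azure upload path, which now goes through BlobServiceClient.upload_blob instead of fsspec's put_file. The container name, file paths, and connection string below are placeholders; mlrun.get_dataitem is the generic entry point and is not changed by this diff.

    import os

    import mlrun

    # placeholder credential; any of the AZURE_STORAGE_* variables read by
    # the new storage_options property would work here
    os.environ["AZURE_STORAGE_CONNECTION_STRING"] = "<connection-string>"

    item = mlrun.get_dataitem("az://my-container/data/model.pkl")
    item.upload("local_model.pkl")  # routed through service_client.upload_blob
    raw = item.get()                # read back through the adlfs filesystem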
mlrun/datastore/base.py CHANGED
@@ -24,13 +24,12 @@ import pandas as pd
  import pyarrow
  import pytz
  import requests
- import urllib3
  from deprecated import deprecated

  import mlrun.config
  import mlrun.errors
  from mlrun.errors import err_to_str
- from mlrun.utils import StorePrefix, is_ipython, logger
+ from mlrun.utils import StorePrefix, is_jupyter, logger

  from .store_resources import is_store_uri, parse_store_uri
  from .utils import filter_df_start_end_time, select_columns_from_df
@@ -157,6 +156,18 @@ class DataStore:
      def put(self, key, data, append=False):
          pass

+     def _prepare_put_data(self, data, append=False):
+         mode = "a" if append else "w"
+         if isinstance(data, bytearray):
+             data = bytes(data)
+
+         if isinstance(data, bytes):
+             return data, f"{mode}b"
+         elif isinstance(data, str):
+             return data, mode
+         else:
+             raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
      def stat(self, key):
          pass

@@ -215,6 +226,15 @@ class DataStore:
                  raise mlrun.errors.MLRunInvalidArgumentError(
                      "When providing start_time or end_time, must provide time_column"
                  )
+             if (
+                 start_time
+                 and end_time
+                 and start_time.utcoffset() != end_time.utcoffset()
+             ):
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     "start_time and end_time must have the same time zone"
+                 )
+
              if start_time or end_time or additional_filters:
                  partitions_time_attributes = find_partitions(url, file_system)
                  set_filters(
@@ -232,13 +252,17 @@ class DataStore:
              ):
                  raise ex

-             # TODO: fix timezone issue (ML-6308)
-             if start_time.tzinfo:
-                 start_time_inner = start_time.replace(tzinfo=None)
-                 end_time_inner = end_time.replace(tzinfo=None)
-             else:
-                 start_time_inner = start_time.replace(tzinfo=pytz.utc)
-                 end_time_inner = end_time.replace(tzinfo=pytz.utc)
+             start_time_inner = None
+             if start_time:
+                 start_time_inner = start_time.replace(
+                     tzinfo=None if start_time.tzinfo else pytz.utc
+                 )
+
+             end_time_inner = None
+             if end_time:
+                 end_time_inner = end_time.replace(
+                     tzinfo=None if end_time.tzinfo else pytz.utc
+                 )

              set_filters(
                  partitions_time_attributes,
@@ -319,11 +343,7 @@ class DataStore:
              dfs.append(df_module.read_csv(*updated_args, **kwargs))
              return df_module.concat(dfs)

-         elif (
-             file_url.endswith(".parquet")
-             or file_url.endswith(".pq")
-             or format == "parquet"
-         ):
+         elif mlrun.utils.helpers.is_parquet_file(file_url, format):
              if columns:
                  kwargs["columns"] = columns

@@ -386,7 +406,10 @@ class DataStore:
          }

      def rm(self, path, recursive=False, maxdepth=None):
-         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+         try:
+             self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+         except FileNotFoundError:
+             pass

      @staticmethod
      def _is_dd(df_module):
@@ -596,14 +619,14 @@ class DataItem:
              )
          return df

-     def show(self, format=None):
+     def show(self, format: Optional[str] = None) -> None:
          """show the data object content in Jupyter

          :param format: format to use (when there is no/wrong suffix), e.g. 'png'
          """
-         if not is_ipython:
+         if not is_jupyter:
              logger.warning(
-                 "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
+                 "Jupyter was not detected. `.show()` displays only inside Jupyter."
              )
              return

@@ -721,8 +744,6 @@ class HttpStore(DataStore):

          verify_ssl = mlconf.httpdb.http.verify
          try:
-             if not verify_ssl:
-                 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
              response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
          except OSError as exc:
              raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
@@ -736,7 +757,7 @@
  # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
  # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
  # method specifically to strip away the 'ds' schema as required.
- def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
+ def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
      if not issubclass(cls, fsspec.AbstractFileSystem):
          raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

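Note (not part of the diff): the new DataStore._prepare_put_data helper centralizes the str/bytes/bytearray handling that each store previously duplicated in its own put method. A rough sketch of the resulting behavior, using a local temporary file as the target (the path is a placeholder):

    import mlrun

    item = mlrun.get_dataitem("/tmp/example_payload.txt")  # resolves to the file datastore
    item.put("text payload")               # str is written in "w" mode
    item.put(b"binary payload")            # bytes is written in "wb" mode
    item.put(bytearray(b"also accepted"))  # bytearray is converted to bytes first
    print(item.get())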
mlrun/datastore/datastore.py CHANGED
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()


  def parse_url(url):
+     if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+         url = url.replace("v3io://", "v3io:///", 1)
      parsed_url = urlparse(url)
      schema = parsed_url.scheme.lower()
      endpoint = parsed_url.hostname
@@ -94,7 +96,7 @@ def schema_to_store(schema):
          from .dbfs_store import DBFSStore

          return DBFSStore
-     elif schema == "hdfs":
+     elif schema in ["hdfs", "webhdfs"]:
          from .hdfs import HdfsStore

          return HdfsStore
@@ -207,7 +209,7 @@ class StoreManager:
      ) -> (DataStore, str, str):
          schema, endpoint, parsed_url = parse_url(url)
          subpath = parsed_url.path
-         store_key = f"{schema}://{endpoint}"
+         store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"

          if schema == "ds":
              datastore_profile = datastore_profile_read(url, project_name, secrets)
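
Note (not part of the diff): a small sketch of the new v3io URL normalization in parse_url, which rewrites the two-slash form to the three-slash form before urlparse runs (the URL is a placeholder):

    from mlrun.datastore.datastore import parse_url

    schema, endpoint, parsed_url = parse_url("v3io://projects/my-proj/artifacts/data.csv")
    print(schema)           # "v3io"
    print(endpoint)         # None - the first path segment is no longer parsed as a hostname
    print(parsed_url.path)  # "/projects/my-proj/artifacts/data.csv"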
mlrun/datastore/datastore_profile.py CHANGED
@@ -412,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
          return res or None

      def url(self, subpath):
-         return f"hdfs://{self.host}:{self.http_port}{subpath}"
+         return f"webhdfs://{self.host}:{self.http_port}{subpath}"


  class DatastoreProfile2Json(pydantic.BaseModel):
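
Note (not part of the diff): with this change an HDFS datastore profile emits webhdfs:// URLs, and schema_to_store() above resolves both hdfs and webhdfs to HdfsStore. A hedged sketch, assuming the profile is constructed with the host and http_port fields it formats into the URL:

    from mlrun.datastore.datastore_profile import DatastoreProfileHdfs

    profile = DatastoreProfileHdfs(
        name="my-hdfs",               # profile name (placeholder)
        host="namenode.example.com",  # placeholder namenode host
        http_port=9870,               # WebHDFS HTTP port (placeholder)
    )
    print(profile.url("/data/sample.parquet"))
    # webhdfs://namenode.example.com:9870/data/sample.parquet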
mlrun/datastore/dbfs_store.py CHANGED
@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class

  import mlrun.errors

- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer


  class DatabricksFileBugFixed(DatabricksFile):
@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
          """return fsspec file system object, if supported"""
          filesystem_class = get_filesystem_class(protocol=self.kind)
          if not self._filesystem:
-             self._filesystem = makeDatastoreSchemaSanitizer(
+             self._filesystem = make_datastore_schema_sanitizer(
                  cls=filesystem_class,
                  using_bucket=False,
                  **self.get_storage_options(),
@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                  "Append mode not supported for Databricks file system"
              )
          # can not use append mode because it overrides data.
-         mode = "w"
-         if isinstance(data, bytes):
-             mode += "b"
-         elif not isinstance(data, str):
-             raise TypeError(f"Unknown data type {type(data)}")
+         data, mode = self._prepare_put_data(data, append)
          with self.filesystem.open(key, mode) as f:
              f.write(data)

mlrun/datastore/filestore.py CHANGED
@@ -66,9 +66,7 @@ class FileStore(DataStore):
          dir_to_create = path.dirname(self._join(key))
          if dir_to_create:
              self._ensure_directory(dir_to_create)
-         mode = "a" if append else "w"
-         if isinstance(data, bytes):
-             mode = mode + "b"
+         data, mode = self._prepare_put_data(data, append)
          with open(self._join(key), mode) as fp:
              fp.write(data)
              fp.close()
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -12,44 +12,82 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import json
+ import os
  from pathlib import Path

  from fsspec.registry import get_filesystem_class
+ from google.auth.credentials import Credentials
+ from google.cloud.storage import Client, transfer_manager
+ from google.oauth2 import service_account

  import mlrun.errors
  from mlrun.utils import logger

- from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+ from .base import DataStore, FileStats, make_datastore_schema_sanitizer

  # Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...


  class GoogleCloudStorageStore(DataStore):
      using_bucket = True
+     workers = 8
+     chunk_size = 32 * 1024 * 1024

      def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
          super().__init__(parent, name, schema, endpoint, secrets=secrets)
+         self._storage_client = None
+         self._storage_options = None
+
+     @property
+     def storage_client(self):
+         if self._storage_client:
+             return self._storage_client
+
+         token = self._get_credentials().get("token")
+         access = "https://www.googleapis.com/auth/devstorage.full_control"
+         if isinstance(token, str):
+             if os.path.exists(token):
+                 credentials = service_account.Credentials.from_service_account_file(
+                     token, scopes=[access]
+                 )
+             else:
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     "gcsfs authentication file not found!"
+                 )
+         elif isinstance(token, dict):
+             credentials = service_account.Credentials.from_service_account_info(
+                 token, scopes=[access]
+             )
+         elif isinstance(token, Credentials):
+             credentials = token
+         else:
+             raise ValueError(f"Unsupported token type: {type(token)}")
+         self._storage_client = Client(credentials=credentials)
+         return self._storage_client

      @property
      def filesystem(self):
          """return fsspec file system object, if supported"""
-         if self._filesystem:
-             return self._filesystem
-         try:
-             import gcsfs  # noqa
-         except ImportError as exc:
-             raise ImportError(
-                 "Google gcsfs not installed, run pip install gcsfs"
-             ) from exc
-         filesystem_class = get_filesystem_class(protocol=self.kind)
-         self._filesystem = makeDatastoreSchemaSanitizer(
-             filesystem_class,
-             using_bucket=self.using_bucket,
-             **self.get_storage_options(),
-         )
+         if not self._filesystem:
+             filesystem_class = get_filesystem_class(protocol=self.kind)
+             self._filesystem = make_datastore_schema_sanitizer(
+                 filesystem_class,
+                 using_bucket=self.using_bucket,
+                 **self.storage_options,
+             )
          return self._filesystem

-     def get_storage_options(self):
+     @property
+     def storage_options(self):
+         if self._storage_options:
+             return self._storage_options
+         credentials = self._get_credentials()
+         # due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
+         credentials["use_listings_cache"] = False
+         self._storage_options = credentials
+         return self._storage_options
+
+     def _get_credentials(self):
          credentials = self._get_secret_or_env(
              "GCP_CREDENTIALS"
          ) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
@@ -71,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
              )
          return self._sanitize_storage_options(None)

+     def get_storage_options(self):
+         return self.storage_options
+
      def _make_path(self, key):
          key = key.strip("/")
          path = Path(self.endpoint, key).as_posix()
@@ -90,21 +131,34 @@ class GoogleCloudStorageStore(DataStore):
              raise mlrun.errors.MLRunInvalidArgumentError(
                  "Append mode not supported for Google cloud storage datastore"
              )
-
-         if isinstance(data, bytes):
-             mode = "wb"
-         elif isinstance(data, str):
-             mode = "w"
-         else:
-             raise TypeError(
-                 "Data type unknown. Unable to put in Google cloud storage!"
-             )
+         data, mode = self._prepare_put_data(data, append)
          with self.filesystem.open(path, mode) as f:
              f.write(data)

      def upload(self, key, src_path):
-         path = self._make_path(key)
-         self.filesystem.put_file(src_path, path, overwrite=True)
+         file_size = os.path.getsize(src_path)
+         united_path = self._make_path(key)
+
+         # Multiple upload limitation recommendations as described in
+         # https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
+
+         if file_size <= self.chunk_size:
+             self.filesystem.put_file(src_path, united_path, overwrite=True)
+             return
+
+         bucket = self.storage_client.bucket(self.endpoint)
+         blob = bucket.blob(key.strip("/"))
+
+         try:
+             transfer_manager.upload_chunks_concurrently(
+                 src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
+             )
+         except Exception as upload_chunks_concurrently_exception:
+             logger.warning(
+                 f"gcs: failed to concurrently upload {src_path},"
+                 f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
+             )
+             self.filesystem.put_file(src_path, united_path, overwrite=True)

      def stat(self, key):
          path = self._make_path(key)
@@ -133,11 +187,13 @@ class GoogleCloudStorageStore(DataStore):

      def rm(self, path, recursive=False, maxdepth=None):
          path = self._make_path(path)
-         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+         # in order to raise an error in case of a connection error (ML-7056)
+         self.filesystem.exists(path)
+         super().rm(path, recursive=recursive, maxdepth=maxdepth)

      def get_spark_options(self):
          res = {}
-         st = self.get_storage_options()
+         st = self._get_credentials()
          if "token" in st:
              res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
          if isinstance(st["token"], str):
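
Note (not part of the diff): uploads through the GCS datastore now switch to google-cloud-storage's transfer_manager when the source file exceeds chunk_size (32 MiB) and keep using a single fsspec put_file otherwise, with a fallback to the single-part path if the chunked upload fails. A rough usage sketch; the bucket, object path, and credentials file are placeholders:

    import os

    import mlrun

    # service-account key file, read via the GOOGLE_APPLICATION_CREDENTIALS lookup above
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"

    item = mlrun.get_dataitem("gcs://my-bucket/models/large_model.bin")
    item.upload("large_model.bin")  # chunked and concurrent when the file is larger than 32 MiB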
mlrun/datastore/inmem.py CHANGED
@@ -72,7 +72,7 @@ class InMemoryStore(DataStore):
              if columns:
                  kwargs["usecols"] = columns
              reader = df_module.read_csv
-         elif url.endswith(".parquet") or url.endswith(".pq") or format == "parquet":
+         elif mlrun.utils.helpers.is_parquet_file(url, format):
              if columns:
                  kwargs["columns"] = columns
              reader = df_module.read_parquet
@@ -85,3 +85,6 @@ class InMemoryStore(DataStore):
              kwargs.pop(field, None)

          return reader(item, **kwargs)
+
+     def rm(self, path, recursive=False, maxdepth=None):
+         self._items.pop(path, None)
mlrun/datastore/redis.py CHANGED
@@ -126,6 +126,7 @@ class RedisStore(DataStore):

      def put(self, key, data, append=False):
          key = RedisStore.build_redis_key(key)
+         data, _ = self._prepare_put_data(data, append)
          if append:
              self.redis.append(key, data)
          else: