mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/data_types/to_pandas.py

@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
 
     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@ def toPandas(spark_df):
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -65,10 +60,10 @@
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n %s\n"
+                    f"failed by the reason below:\n {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true." % str(e)
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False
@@ -78,7 +73,7 @@
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                     "reached the error below and will not continue because automatic fallback "
                     "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                    "false.\n %s" % str(e)
+                    f"false.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -144,7 +139,7 @@
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n %s" % str(e)
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -154,10 +149,10 @@
     column_counter = Counter(spark_df.columns)
 
     dtype = [None] * len(spark_df.schema)
-    for fieldIdx, field in enumerate(spark_df.schema):
+    for field_idx, field in enumerate(spark_df.schema):
         # For duplicate column name, we use `iloc` to access it.
         if column_counter[field.name] > 1:
-            pandas_col = pdf.iloc[:, fieldIdx]
+            pandas_col = pdf.iloc[:, field_idx]
         else:
             pandas_col = pdf[field.name]
 
@@ -171,12 +166,12 @@
             and field.nullable
             and pandas_col.isnull().any()
         ):
-            dtype[fieldIdx] = pandas_type
+            dtype[field_idx] = pandas_type
         # Ensure we fall back to nullable numpy types, even when whole column is null:
         if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = np.float64
+            dtype[field_idx] = np.float64
         if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = object
+            dtype[field_idx] = object
 
     df = pd.DataFrame()
     for index, t in enumerate(dtype):
@@ -217,22 +212,68 @@
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
-    if type(dt) == ByteType:
+    if isinstance(dt, ByteType):
         return np.int8
-    elif type(dt) == ShortType:
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif type(dt) == IntegerType:
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif type(dt) == LongType:
+    elif isinstance(dt, LongType):
        return np.int64
-    elif type(dt) == FloatType:
+    elif isinstance(dt, FloatType):
        return np.float32
-    elif type(dt) == DoubleType:
+    elif isinstance(dt, DoubleType):
        return np.float64
-    elif type(dt) == BooleanType:
+    elif isinstance(dt, BooleanType):
        return bool
-    elif type(dt) == TimestampType:
+    elif isinstance(dt, TimestampType):
        return "datetime64[ns]"
     else:
        return None
+
+
+def spark_df_to_pandas(spark_df):
+    import pyspark
+
+    if semver.parse(pyspark.__version__) >= semver.Version(3, 5, 0):
+
+        def to_pandas(spark_df_inner):
+            return spark_df_inner.toPandas()
+    else:
+        to_pandas = _to_pandas
+
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return to_pandas(spark_df)
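
For orientation, a minimal sketch of how the new spark_df_to_pandas helper is meant to be called (assuming a local SparkSession; the DataFrame contents below are placeholders):

from pyspark.sql import SparkSession

from mlrun.data_types.to_pandas import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark_df = spark.createDataFrame(
    [(1, "2024-01-01 12:00:00")], ["id", "ts"]
).selectExpr("id", "to_timestamp(ts) as ts")

# On pyspark >= 3.5.0 this dispatches to DataFrame.toPandas(); on older versions
# it uses the patched _to_pandas above, and with pandas 2 it reformats
# TimestampType columns before casting them back to datetime64[ns].
pdf = spark_df_to_pandas(spark_df)
print(pdf.dtypes)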
mlrun/datastore/__init__.py

@@ -64,7 +64,7 @@ from .store_resources import (
     parse_store_uri,
 )
 from .targets import CSVTarget, NoSqlTarget, ParquetTarget, StreamTarget
-from .utils import parse_kafka_url
+from .utils import get_kafka_brokers_from_dict, parse_kafka_url
 
 store_manager = StoreManager()
 
@@ -107,19 +107,17 @@ def get_stream_pusher(stream_path: str, **kwargs):
     :param stream_path: path/url of stream
     """
 
-    if stream_path.startswith("kafka://") or "kafka_bootstrap_servers" in kwargs:
-        topic, bootstrap_servers = parse_kafka_url(
-            stream_path, kwargs.get("kafka_bootstrap_servers")
-        )
-        return KafkaOutputStream(
-            topic, bootstrap_servers, kwargs.get("kafka_producer_options")
-        )
+    kafka_brokers = get_kafka_brokers_from_dict(kwargs)
+    if stream_path.startswith("kafka://") or kafka_brokers:
+        topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
+        return KafkaOutputStream(topic, brokers, kwargs.get("kafka_producer_options"))
     elif stream_path.startswith("http://") or stream_path.startswith("https://"):
         return HTTPOutputStream(stream_path=stream_path)
     elif "://" not in stream_path:
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
@@ -133,9 +131,9 @@ class _DummyStream:
     def __init__(self, event_list=None, **kwargs):
         self.event_list = event_list or []
 
-    def push(self, data):
+    def push(self, data, **kwargs):
         if not isinstance(data, list):
             data = [data]
         for item in data:
-            logger.info(f"dummy stream got event: {item}")
+            logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
             self.event_list.append(item)
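
A hedged sketch of the updated get_stream_pusher dispatch, using the in-memory dummy:// stream (for kafka:// paths, brokers can now also be supplied through kwargs and are resolved via get_kafka_brokers_from_dict, whose accepted key names are not shown in this hunk). The extra keyword argument below is a hypothetical placeholder, passed through because push() now accepts **kwargs:

import mlrun.datastore

# In-memory dummy stream, handy for testing the pusher interface
pusher = mlrun.datastore.get_stream_pusher("dummy://")
pusher.push({"event": "hello"}, partition_key="k1")  # partition_key is illustrative only
print(pusher.event_list)  # [{'event': 'hello'}]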
mlrun/datastore/alibaba_oss.py (new file)

@@ -0,0 +1,131 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urlparse
+
+import oss2
+from fsspec.registry import get_filesystem_class
+
+import mlrun.errors
+
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
+
+
+class OSSStore(DataStore):
+    using_bucket = True
+
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+        # will be used in case user asks to assume a role and work through fsspec
+
+        access_key_id = self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID")
+        secret_key = self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY")
+        endpoint_url = self._get_secret_or_env("ALIBABA_ENDPOINT_URL")
+        if access_key_id and secret_key and endpoint_url:
+            self.auth = oss2.Auth(access_key_id, secret_key)
+            self.endpoint_url = endpoint_url
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "missing ALIBABA_ACCESS_KEY_ID or ALIBABA_SECRET_ACCESS_KEY ALIBABA_ENDPOINT_URL in environment"
+            )
+
+    @property
+    def filesystem(self):
+        """return fsspec file system object, if supported"""
+        if self._filesystem:
+            return self._filesystem
+        try:
+            import ossfs  # noqa
+        except ImportError as exc:
+            raise ImportError("ALIBABA ossfs not installed") from exc
+        filesystem_class = get_filesystem_class(protocol=self.kind)
+        self._filesystem = make_datastore_schema_sanitizer(
+            filesystem_class,
+            using_bucket=self.using_bucket,
+            **self.get_storage_options(),
+        )
+        return self._filesystem
+
+    def get_storage_options(self):
+        res = dict(
+            endpoint=self._get_secret_or_env("ALIBABA_ENDPOINT_URL"),
+            key=self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID"),
+            secret=self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY"),
+        )
+        return self._sanitize_storage_options(res)
+
+    def get_bucket_and_key(self, key):
+        path = self._join(key)[1:]
+        return self.endpoint, path
+
+    def upload(self, key, src_path):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, open(src_path, "rb"))
+
+    def get(self, key, size=None, offset=0):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        if size or offset:
+            return oss.get_object(key, byte_range=self.get_range(size, offset)).read()
+        return oss.get_object(key).read()
+
+    def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, data)
+
+    def stat(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        obj = oss.get_object_meta(key)
+        size = obj.content_length
+        modified = datetime.fromtimestamp(obj.last_modified)
+        return FileStats(size, time.mktime(modified.timetuple()))
+
+    def listdir(self, key):
+        remote_path = self._convert_key_to_remote_path(key)
+        if self.filesystem.isfile(remote_path):
+            return key
+        remote_path = f"{remote_path}/**"
+        files = self.filesystem.glob(remote_path)
+        key_length = len(key)
+        files = [
+            f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
+        ]
+        return files
+
+    def delete(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.delete_object(key)
+
+    def _convert_key_to_remote_path(self, key):
+        key = key.strip("/")
+        schema = urlparse(key).scheme
+        # if called without passing dataitem - like in fset.purge_targets,
+        # key will include schema.
+        if not schema:
+            key = Path(self.endpoint, key).as_posix()
+        return key
+
+    @staticmethod
+    def get_range(size, offset):
+        if size:
+            return [offset, size]
+        return [offset, None]
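
A hedged usage sketch for the new Alibaba OSS datastore: the credentials are read from the environment exactly as OSSStore.__init__ does above, while the oss:// scheme, bucket, and key below are illustrative assumptions (the scheme registration itself lives in mlrun/datastore/datastore.py, not in this file):

import os

import mlrun

# Read by OSSStore.__init__ / get_storage_options (see above)
os.environ["ALIBABA_ACCESS_KEY_ID"] = "<access-key-id>"
os.environ["ALIBABA_SECRET_ACCESS_KEY"] = "<secret-access-key>"
os.environ["ALIBABA_ENDPOINT_URL"] = "https://oss-cn-hangzhou.aliyuncs.com"

# Hypothetical bucket/key; reads go through OSSStore.get or the ossfs filesystem
item = mlrun.get_dataitem("oss://my-bucket/path/to/data.csv")
df = item.as_df()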
mlrun/datastore/azure_blob.py

@@ -16,12 +16,13 @@ import time
 from pathlib import Path
 from urllib.parse import urlparse
 
+from azure.storage.blob import BlobServiceClient
 from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
 # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
 
 class AzureBlobStore(DataStore):
     using_bucket = True
+    max_concurrency = 100
+    max_blocksize = 1024 * 1024 * 4
+    max_single_put_size = (
+        1024 * 1024 * 8
+    )  # for service_client property only, does not affect filesystem
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._service_client = None
+        self._storage_options = None
+
+    def get_storage_options(self):
+        return self.storage_options
+
+    @property
+    def storage_options(self):
+        if not self._storage_options:
+            res = dict(
+                account_name=self._get_secret_or_env("account_name")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                account_key=self._get_secret_or_env("account_key")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                connection_string=self._get_secret_or_env("connection_string")
+                or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                tenant_id=self._get_secret_or_env("tenant_id")
+                or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                client_id=self._get_secret_or_env("client_id")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                client_secret=self._get_secret_or_env("client_secret")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                sas_token=self._get_secret_or_env("sas_token")
+                or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                credential=self._get_secret_or_env("credential"),
+            )
+            self._storage_options = self._sanitize_storage_options(res)
+        return self._storage_options
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
         try:
             import adlfs  # noqa
         except ImportError as exc:
             raise ImportError("Azure adlfs not installed") from exc
-        # in order to support az and wasbs kinds.
-        filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
-            filesystem_class,
-            using_bucket=self.using_bucket,
-            **self.get_storage_options(),
-        )
+
+        if not self._filesystem:
+            # in order to support az and wasbs kinds
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                blocksize=self.max_blocksize,
+                **self.storage_options,
+            )
         return self._filesystem
 
-    def get_storage_options(self):
-        res = dict(
-            account_name=self._get_secret_or_env("account_name")
-            or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
-            account_key=self._get_secret_or_env("account_key")
-            or self._get_secret_or_env("AZURE_STORAGE_KEY"),
-            connection_string=self._get_secret_or_env("connection_string")
-            or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
-            tenant_id=self._get_secret_or_env("tenant_id")
-            or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
-            client_id=self._get_secret_or_env("client_id")
-            or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
-            client_secret=self._get_secret_or_env("client_secret")
-            or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
-            sas_token=self._get_secret_or_env("sas_token")
-            or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
-            credential=self._get_secret_or_env("credential"),
-        )
-        return self._sanitize_storage_options(res)
+    @property
+    def service_client(self):
+        try:
+            import azure  # noqa
+        except ImportError as exc:
+            raise ImportError("Azure not installed") from exc
+
+        if not self._service_client:
+            self._do_connect()
+        return self._service_client
+
+    def _do_connect(self):
+        """
+
+        Creates a client for azure.
+        Raises MLRunInvalidArgumentError if none of the connection details are available
+        based on do_connect in AzureBlobFileSystem:
+        https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+        """
+        from azure.identity import ClientSecretCredential
+
+        storage_options = self.storage_options
+        connection_string = storage_options.get("connection_string")
+        client_name = storage_options.get("account_name")
+        account_key = storage_options.get("account_key")
+        sas_token = storage_options.get("sas_token")
+        client_id = storage_options.get("client_id")
+        credential = storage_options.get("credential")
+
+        credential_from_client_id = None
+        if (
+            credential is None
+            and account_key is None
+            and sas_token is None
+            and client_id is not None
+        ):
+            credential_from_client_id = ClientSecretCredential(
+                tenant_id=storage_options.get("tenant_id"),
+                client_id=client_id,
+                client_secret=storage_options.get("client_secret"),
+            )
+        try:
+            if connection_string is not None:
+                self._service_client = BlobServiceClient.from_connection_string(
+                    conn_str=connection_string,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            elif client_name is not None:
+                account_url = f"https://{client_name}.blob.core.windows.net"
+                cred = credential_from_client_id or credential or account_key
+                if not cred and sas_token is not None:
+                    if not sas_token.startswith("?"):
+                        sas_token = f"?{sas_token}"
+                    account_url = account_url + sas_token
+                self._service_client = BlobServiceClient(
+                    account_url=account_url,
+                    credential=cred,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Must provide either a connection_string or account_name with credentials"
+                )
+        except Exception as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"unable to connect to account for {e}"
+            )
 
     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):
 
     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-        self.filesystem.put_file(src_path, remote_path, overwrite=True)
+        container, remote_path = remote_path.split("/", 1)
+        container_client = self.service_client.get_container_client(container=container)
+        with open(file=src_path, mode="rb") as data:
+            container_client.upload_blob(
+                name=remote_path,
+                data=data,
+                overwrite=True,
+                max_concurrency=self.max_concurrency,
+            )
 
     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@
             "Append mode not supported for Azure blob datastore"
             )
         remote_path = self._convert_key_to_remote_path(key)
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError("Data type unknown. Unable to put in Azure!")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(remote_path, mode) as f:
             f.write(data)
 
@@ -135,7 +223,7 @@
 
     def get_spark_options(self):
         res = {}
-        st = self.get_storage_options()
+        st = self.storage_options
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
@@ -158,18 +246,17 @@
                 st[key] = parsed_value
 
         account_name = st.get("account_name")
-        if not account_name:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Property 'account_name' is absent both in storage settings and connection string"
-            )
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
            host = primary_url
-        else:
+        elif account_name:
            host = f"{account_name}.{service}.core.windows.net"
+        else:
+            return res
+
         if "account_key" in st:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
 
@@ -209,6 +296,7 @@ class AzureBlobStore(DataStore):
         for key in spark_options:
             if key.startswith(prefix):
                 account_key = key[len(prefix) :]
-                url += f"@{account_key}"
+                if not url.endswith(account_key):
+                    url += f"@{account_key}"
                 break
         return url
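
A hedged sketch of wiring credentials for the reworked Azure Blob datastore. The environment variable names mirror the storage_options property above (note that 1.7.2 reads AZURE_STORAGE_ACCOUNT_KEY where 1.7.0rc4 read AZURE_STORAGE_KEY); the container and blob paths are placeholders:

import os

import mlrun

os.environ["AZURE_STORAGE_ACCOUNT_NAME"] = "<account-name>"
os.environ["AZURE_STORAGE_ACCOUNT_KEY"] = "<account-key>"

item = mlrun.get_dataitem("az://my-container/path/to/data.parquet")
item.upload("local_file.parquet")  # upload() now goes through BlobServiceClient with max_concurrency
df = item.as_df()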